Beispiel #1
0
def generate_word_sequence_recognition_wfst_test(n,
                                                 lex,
                                                 original=False,
                                                 weight_fwd=None,
                                                 weight_self=None):
    """Build a WFST with one phone-HMM chain per pronunciation in the lexicon.

    Every pronunciation of every word gets its own branch leaving the start
    state.  The first arc of a branch consumes nothing and emits the word
    label; the phone HMMs are then appended by ``generate_phone_wfst`` and the
    last state of the branch is marked final.  No arc leads back to the start
    state.

    Args:
        n (int): states per phone HMM
        lex: lexicon source handed to ``parse_lexicon``
        original (bool): forwarded to ``parse_lexicon``
            (True/False - original/optimized lexicon)
        weight_fwd: forward-transition weight passed to ``generate_phone_wfst``
        weight_self: self-loop weight passed to ``generate_phone_wfst``
    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word symbol
        table built from the lexicon

    """
    # A 'log' semiring FST is only used when both weights are supplied;
    # otherwise the default arc type is used and the word arcs get no weight.
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None

    lex = parse_lexicon(lex, original)

    # NOTE(review): the symbol tables are generated with a hard-coded 3 while
    # the phone HMMs below use ``n`` - confirm the two are meant to differ.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # -- make fst
    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def create_priors(priors, isym, osym, code):
    """Build a linear transducer over the priors, closed by a <sigma> loop.

    ``priors`` is a ";"-separated string.  Each item becomes one arc (same
    label on input and output, looked up in ``code``).  After the chain one
    more ``<sigma>`` arc leads into a last state that carries a ``<sigma>``
    self-loop; that state acts as the place holder.

    Returns:
        tuple: ``(f, place_holder_state)``
    """
    machine = fst.Fst()
    machine.set_input_symbols(isym)
    machine.set_output_symbols(osym)

    tail = machine.add_state()
    machine.set_start(tail)

    # one arc per prior, chained one after the other
    for prior in priors.split(";"):
        head = machine.add_state()
        machine.add_arc(
            tail,
            fst.Arc(code[prior], code[prior],
                    fst.Weight(machine.weight_type(), 1.0), head))
        tail = head

    # the <sigma> (joker) place holder: enter it once, then loop on it
    place_holder = machine.add_state()
    sigma = "<sigma>"
    machine.add_arc(
        tail,
        fst.Arc(code[sigma], code[sigma],
                fst.Weight(machine.weight_type(), 1.0), place_holder))
    machine.add_arc(
        place_holder,
        fst.Arc(code[sigma], code[sigma],
                fst.Weight(machine.weight_type(), 1.0), place_holder))
    return machine, place_holder
Beispiel #3
0
def OpenFST_Automata_Test(set_src_states, set_dst_states, set_labels):
    """Print every (source, label, destination) triple of an automaton.

    Each of the three collections is consumed as an iterable of
    ``(index, value)`` pairs, because that is how the loops unpack them;
    only the value part is used.

    Args:
        set_src_states: iterable of ``(index, source_state)`` pairs.
        set_dst_states: mapping ``source_state -> label -> iterable of
            (index, destination_state)`` pairs.
        set_labels: mapping ``source_state -> iterable of (index, label)``
            pairs.

    NOTE(review): callers holding plain collections may have been expected to
    pass ``enumerate(...)`` - confirm against the call sites.
    """
    for _, src in set_src_states:
        for _, label in set_labels[src]:
            # The original indexed with the undefined name ``labels`` here,
            # which raised NameError as soon as this line was reached.
            for _, dst in set_dst_states[src][label]:
                print(src, label, dst)
def build_refiner(isyms_fname, refiner_fname):
    """Build the two-state "refiner" transducer and write it to disk.

    The machine is meant to help extract the last arc of another machine:
    state 0 loops on symbols and state 1 is the only final state.

    Args:
        isyms_fname: path of the text symbol table to read.
        refiner_fname: path the resulting FST is written to.
    """
    # symbol -> label lookup built from the symbol table
    symbols = fst.SymbolTable.read_text(isyms_fname)
    code = {sym: label for label, sym in symbols}

    machine = fst.Fst()
    machine.set_input_symbols(symbols)
    machine.set_output_symbols(symbols)
    loop_state = machine.add_state()
    exit_state = machine.add_state()

    for sym, label in code.items():
        # label 0 is skipped entirely
        if label == 0:
            continue
        # labels below 100 may also be swallowed (mapped to <epsilon>)
        # while staying in the looping state
        if label < 100:
            machine.add_arc(
                loop_state,
                fst.Arc(code[sym], code["<epsilon>"],
                        fst.Weight(machine.weight_type(), 1.0), loop_state))
        # every non-zero label can be copied through on the way to the exit
        machine.add_arc(
            loop_state,
            fst.Arc(code[sym], code[sym],
                    fst.Weight(machine.weight_type(), 1.0), exit_state))

    machine.set_start(loop_state)
    machine.set_final(exit_state)

    machine.write(refiner_fname)
Beispiel #5
0
def generate_word_sequence_recognition_wfst(n):
    """Build a log-semiring WFST that loops over the words of ``word_table``.

    Reads the module-level ``word_table``.  For every entry other than
    ``'<eps>'`` an epsilon arc leaves the start state, ``generate_word_wfst``
    appends the word model, and an epsilon arc returns to the start state.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST

    """
    f = fst.Fst('log')

    root = f.add_state()
    f.set_start(root)

    for _, word in word_table:
        if word == '<eps>':
            continue

        entry = f.add_state()
        # entry weight is -log of the table size as reported by num_symbols()
        enter_weight = fst.Weight('log', -math.log(word_table.num_symbols()))
        f.add_arc(root, fst.Arc(0, 0, enter_weight, entry))

        word_wfst = generate_word_wfst(f, entry, word, n)

        # the highest-numbered state is taken as the end of the word
        last = list(word_wfst.states())[-1]
        back_weight = fst.Weight('log', -math.log(1.0))
        f.add_arc(last, fst.Arc(0, 0, back_weight, root))

    return f
Beispiel #6
0
def generate_phone_sequence_recognition_wfst(n):
    """Build a log-semiring WFST that loops over the phones of ``phone_table``.

    Reads the module-level ``phone_table``.  Each entry other than
    ``'<eps>'`` gets an epsilon arc from the start state, a phone HMM built
    by ``generate_phone_wfst``, a final end state, and an epsilon arc back to
    the start state.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST

    """
    f = fst.Fst('log')

    root = f.add_state()
    f.set_start(root)

    for _, phone in phone_table:
        if phone == '<eps>':
            continue

        entry = f.add_state()
        # entry weight is -log of the table size as reported by num_symbols()
        enter_weight = fst.Weight('log',
                                  -math.log(phone_table.num_symbols()))
        f.add_arc(root, fst.Arc(0, 0, enter_weight, entry))

        phone_end = generate_phone_wfst(f, entry, phone, n)
        f.set_final(phone_end)

        back_weight = fst.Weight('log', -math.log(1))
        f.add_arc(phone_end, fst.Arc(0, 0, back_weight, root))

    return f
Beispiel #7
0
    def __init__(self, lex, original=True):
        """Parse the lexicon, build its symbol tables and fill the FST.

        Args:
            lex: lexicon source handed to ``parse_lexicon``.
            original (bool): accepted but not read here; the lexicon is
                always parsed with ``False``.
                NOTE(review): confirm whether it was meant to be forwarded.
        """
        self.lex = parse_lexicon(lex, False)

        tables = generate_symbols_table(self.lex, 3)
        self.word_table, self.phone_table, self.state_table = tables
        self.output_table = generate_output_table(self.word_table,
                                                  self.phone_table)

        # the FST starts with a single state that is also the root node
        self.f = fst.Fst()
        root_state = self.f.add_state()
        self.f.set_start(root_state)
        self.root = self.getNode('', root_state, root=True)

        # every lexicon entry is added through insert()
        for entry in self.lex:
            self.insert(entry)

        self.f.set_input_symbols(self.state_table)
        self.f.set_output_symbols(self.output_table)
Beispiel #8
0
def generate_phone_sequence_recognition_wfst(n, state_table, phone_table):
    """Build a WFST accepting any sequence of phones found in the lexicon.

    The phones are collected from the module-level ``lex`` mapping.  Each
    phone HMM is reachable from the start state through an epsilon arc and
    returns to it through another epsilon arc; the end of each phone is
    final.

    Args:
        n (int): states per phone HMM
        state_table: passed through to ``generate_phone_wfst``
        phone_table: passed through to ``generate_phone_wfst``

    Returns:
        the constructed WFST

    """
    f = fst.Fst()

    root = f.add_state()
    f.set_start(root)

    # unique phones over all lexicon values
    phone_set = set()
    for pronunciation in lex.values():
        phone_set = phone_set.union(pronunciation)

    for phone in phone_set:
        entry = f.add_state()
        f.add_arc(root, fst.Arc(0, 0, None, entry))

        phone_end = generate_phone_wfst(f, entry, phone, n, state_table,
                                        phone_table)

        # loop back so that another phone can follow
        f.add_arc(phone_end, fst.Arc(0, 0, None, root))
        f.set_final(phone_end)

    return f
Beispiel #9
0
def generate_WFST_final_probability(n,
                                    lex,
                                    weight_fwd,
                                    weight_self,
                                    weights_final,
                                    original=False):
    """Build a looping word-recognition WFST with per-word final weights.

    Every pronunciation of every word gets its own branch from the start
    state: an arc that emits the word label, the phone HMMs appended by
    ``generate_phone_wfst``, and an epsilon arc back to the start state.  The
    end state of each branch is final with weight ``-log(weights_final[word])``.

    Previously the weight was applied once after the pronunciation loop, so
    only the last pronunciation of a word received it (the others were final
    with the default weight), and a word without pronunciations re-weighted
    the end state left over from the preceding word.

    Args:
        n (int): states per phone HMM
        lex: lexicon source handed to ``parse_lexicon``
        weight_fwd: forward-transition weight passed to ``generate_phone_wfst``
        weight_self: self-loop weight passed to ``generate_phone_wfst``
        weights_final (dict): word -> probability of final state
        original (bool): True/False - original/optimized lexicon
    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word symbol
        table built from the lexicon
    Raises:
        KeyError: if a lexicon word is missing from ``weights_final``

    """

    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))

    lex = parse_lexicon(lex, original)

    # NOTE(review): the symbol tables are generated with a hard-coded 3 while
    # the phone HMMs below use ``n`` - confirm the two are meant to differ.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for word, phone_list in lex.items():
        # the same final weight applies to every pronunciation of the word
        final_weight = fst.Weight('log', -math.log(weights_final[word]))

        for phones in phone_list:
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state

            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)

            # end of this pronunciation: final, and free to start a new word
            f.set_final(current_state, final_weight)
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
 def __init__(self):
     """Start with an empty lexicon FST and empty bookkeeping containers."""
     # the transducer under construction
     self.lexicon_fst = fst.Fst()
     # collected lexicon entries
     self.lexicons = []
     # insertion-ordered lookup tables
     self.words = OrderedDict()
     self.disambig_graphemes = OrderedDict()
     self.max_disambig = 0
     # NOTE(review): presumably state ids used while building - confirm
     self.start = 0
     self.last_s = 2
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname):
    """
    Make a lattice that maps
    lemmas and constraints (or priors) to
    an inflected version, and write it to ``lattice_output``.

    Args:
        dev_fname: text file whose last three whitespace-separated fields per
            line are constraints, lemma and inflection.
        isyms_fname: path of the text symbol table.
        constraints: constraint string; "_" is rewritten to ";" before it is
            compared with the file entries.
        lattice_output: path the final lattice is written to.
        refiner_fname: not used by this function.

    Raises:
        ValueError: if sigma self-loops have to be added but no line of
            ``dev_fname`` matched ``constraints`` (this used to surface as a
            NameError on ``keep``).
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms and build a symbol -> label lookup
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst()
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # place-holder state of the most recent matching entry
    keep = None

    # the context manager closes the file; it used to be left open
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            cns, lemma, inflection = line.split()[-3:]
            if cns != constraints:
                continue
            print(cns, lemma, inflection)

            # find idx where the strings diverge (0 if no mismatch is found)
            idx = 0
            for i, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = i
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            for j in range(idx, len(lemma)):
                new = f.add_state()
                f.add_arc(old, fst.Arc(code[lemma[j]], code[lemma[j]], fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual (indirectly)
            sym = lemma[idx:] + "_" + inflection[idx:]
            print(lemma, inflection, sym)
            f.add_arc(old, fst.Arc(code[sym], code[s_fin], fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> self-loops on the <sigma> place holder
    # (hard coded) labels 2..35: symbols of Russian + 2 more
    sigma_symbols = [c for c, ltr in code.items() if 1 < int(ltr) < 36]
    if sigma_symbols and keep is None:
        raise ValueError(
            "no entry in {!r} matches constraints {!r}".format(
                dev_fname, constraints))
    # NOTE(review): ``keep`` is a state id taken from the per-entry machine
    # but is used on ``f_big`` after union/determinize - confirm it still
    # designates the place-holder state there.
    for c in sigma_symbols:
        f_big.add_arc(keep, fst.Arc(code[c], code[c], fst.Weight(f_big.weight_type(), 1.0), keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)
Beispiel #12
0
 def test_simple(self):
     """A three-state FST reports its state count and its final weight."""
     machine = fst.Fst()
     first = machine.add_state()
     second = machine.add_state()
     third = machine.add_state()

     # two arcs with the same labels leave the first state
     weighted_arc = fst.Arc(1, 1, fst.Weight(machine.weight_type(), 3.0),
                            second)
     machine.add_arc(first, weighted_arc)
     unit_arc = fst.Arc(1, 1, fst.Weight.One(machine.weight_type()), third)
     machine.add_arc(first, unit_arc)

     machine.set_start(first)
     machine.set_final(third, fst.Weight(machine.weight_type(), 1.5))

     # Test fst
     self.assertEqual(machine.num_states(), 3)
     self.assertAlmostEqual(float(machine.final(third)), 1.5)
 def __init__(self):
     """Create the grammar FST with its two initial states.

     State 0 is registered under '<start>' and state 1 under '<s>'.
     ``set_start`` is called for both in turn, state 1 last.
     """
     self.bigram2state = {}
     self.words_table = {}
     self.disambig_symbol = '#0'
     self.order = 0

     self.grammar_fst = fst.Fst()
     self.unigram2state = {}

     # state 0: '<start>'
     self.grammar_fst.add_state()
     self.grammar_fst.set_start(0)
     self.unigram2state['<start>'] = 0

     # state 1: '<s>'
     self.grammar_fst.add_state()
     self.grammar_fst.set_start(1)
     self.unigram2state['<s>'] = 1
Beispiel #14
0
def OpenFST_Automata_Example():
    """Build a small three-state example transducer and print it."""
    machine = fst.Fst()
    first = machine.add_state()
    second = machine.add_state()
    third = machine.add_state()

    # (source, input label, output label, weight, destination)
    arcs = (
        (first, 1, 2, fst.Weight(machine.weight_type(), 3.0), second),
        (first, 1, 3, fst.Weight.One(machine.weight_type()), third),
        (second, 2, 1, fst.Weight(machine.weight_type(), 1.0), third),
    )
    for source, ilabel, olabel, weight, target in arcs:
        machine.add_arc(source, fst.Arc(ilabel, olabel, weight, target))

    machine.set_start(first)
    machine.set_final(third, fst.Weight(machine.weight_type(), 1.5))

    print(first, second, third)
    print(machine)
Beispiel #15
0
    def make_query(self, cns, lemma):
        """Build a linear acceptor for one query.

        The symbol sequence is the ";"-separated constraints, then
        "<sigma>", then the characters of the lemma, then "</s>".  Labels
        come from ``self.code`` and the symbol tables from
        ``self.input_syms``.

        Returns:
            the FST, whose last state is final.
        """
        symbols = cns.split(";") + ["<sigma>"] + list(lemma) + ["</s>"]

        query = fst.Fst()
        query.set_input_symbols(self.input_syms)
        query.set_output_symbols(self.input_syms)

        tail = query.add_state()
        query.set_start(tail)
        for symbol in symbols:
            head = query.add_state()
            query.add_arc(
                tail,
                fst.Arc(self.code[symbol], self.code[symbol],
                        fst.Weight(query.weight_type(), 1.0), head))
            tail = head

        query.set_final(tail)
        return query
Beispiel #16
0
    def from_vocab(cls, vocab, tokenizer):
        """Build a lexicon FST from ``vocab`` and wrap it in ``cls``.

        Each word is turned into token indices by ``tokenizer.token2idx``,
        terminated with ``tokenizer.space_idx``, and added as a chain of arcs
        from the start state.  The result is epsilon-removed, determinized
        and minimized before being handed to ``cls(fst_path=None, fst=...)``.
        """
        lexicon = openfst.Fst()

        def add_word(word):
            indices = tokenizer.token2idx(word) + [tokenizer.space_idx]

            # the very first word also creates the start state (state 0)
            if not lexicon.num_states():
                root = lexicon.add_state()
                assert root == 0
                lexicon.set_start(root)

            state = lexicon.start()
            for index in indices:
                # Labels are shifted by one so that no arc carries label 0;
                # otherwise wrong decoding results would be given.
                label = index + 1
                following = lexicon.add_state()
                lexicon.add_arc(state,
                                openfst.Arc(label, label, 0, following))
                state = following

            # the state reached after the trailing space ends the word
            lexicon.set_final(state, openfst.Weight.One('tropical'))

        for word in vocab:
            add_word(word)

        # Remove epsilon transitions (transitions that consume no input).
        # This is needed before determinizing, but can greatly increase the
        # size of the FST.
        lexicon.rmepsilon()

        # Make the FST deterministic: for any input string there is only one
        # state the FST can be in, so lookups never have to follow several
        # transitions at once.
        lexicon = openfst.determinize(lexicon)

        # Shrink to the simplest equivalent FST.  Not required, but it
        # reduces the memory used by the dictionary.
        lexicon.minimize()

        return cls(fst_path=None, fst=lexicon)
    def __init__(self, eps='<eps>', sb='<s>', se='</s>', ds='#0'):
        """Create the grammar FST with its two initial states.

        Args:
            eps: symbol registered for state 0.
            sb: symbol registered for state 1.
            se: stored as ``self.se``.
            ds: stored as ``self.disambig_symbol``.

        ``set_start`` is called for state 0 and then for state 1.
        """
        # special symbols
        self.eps = eps
        self.sb = sb
        self.se = se
        self.disambig_symbol = ds

        # bookkeeping
        self.order = 0
        self.max_order = 0
        self.words_table = {}
        self.gram2state = {}

        self.grammar_fst = fst.Fst()

        # state 0 belongs to the eps symbol
        self.grammar_fst.add_state()
        self.gram2state[eps] = 0  # 0->-1
        self.grammar_fst.set_start(0)  # 0->-1

        # state 1 belongs to the sb symbol
        self.grammar_fst.add_state()
        self.gram2state[sb] = 1
        self.grammar_fst.set_start(1)
Beispiel #18
0
def make_input_fst(query, pysym):
    """Build a linear acceptor for ``query`` and write it to 'input.fst'.

    One arc is added per element of ``query`` (label looked up in
    ``pysym``); a closing ``<eps>`` arc connects the chain to the final
    state.  All weights are 0.0.

    Returns:
        the constructed FST.
    """
    f = fst.Fst()

    def zero():
        # a fresh 0.0 weight in the FST's own weight type
        return fst.Weight(f.weight_type(), 0.0)

    first = f.add_state()
    last = f.add_state()
    f.set_start(first)
    f.set_final(last, zero())

    tail = first
    for ch in query:
        head = f.add_state()
        symbol = pysym[ch]
        f.add_arc(tail, fst.Arc(symbol, symbol, zero(), head))
        tail = head

    # epsilon arc from the end of the chain into the final state
    f.add_arc(tail, fst.Arc(pysym['<eps>'], pysym['<eps>'], zero(), last))

    f.write('input.fst')
    return f
Beispiel #19
0
def make_fst(word_sym, phone_sym, pydict_file):
    """Build a dictionary transducer from a space-separated dictionary file.

    On each line the first field is read character by character as input
    symbols (``phone_sym``) and the second field is the output symbol
    (``word_sym``), emitted on the first arc of the chain.  State 0 is the
    start state, state 1 the final state; all weights are 0.0.

    NOTE(review): for a one-character first field the closing arc re-uses
    the word label as its output - confirm that is intended.

    Returns:
        the constructed FST.
    """
    with open(pydict_file, 'r') as rp:
        f = fst.Fst()

        def zero():
            # a fresh 0.0 weight in the FST's own weight type
            return fst.Weight(f.weight_type(), 0.0)

        start = f.add_state()
        end = f.add_state()
        f.set_start(start)

        # self-loop on the start state emitting <s>
        f.add_arc(start,
                  fst.Arc(phone_sym['<eps>'], word_sym['<s>'], zero(), start))
        # self-loop on the end state emitting </s>
        f.add_arc(end,
                  fst.Arc(phone_sym['<eps>'], word_sym['</s>'], zero(), end))
        # 1 --> 0: epsilon arc from the end state back to the start state
        f.add_arc(end,
                  fst.Arc(phone_sym['<eps>'], word_sym['<eps>'], zero(),
                          start))
        f.set_final(end, zero())

        for line in rp:
            items = line.strip().split(' ')
            tail = start
            olabel = word_sym['<eps>']

            for position, pych in enumerate(items[0]):
                head = f.add_state()
                chch = items[1]
                ilabel = phone_sym[pych]
                # only the first arc of the chain emits the word
                if position == 0:
                    olabel = word_sym[chch]
                else:
                    olabel = word_sym['<eps>']
                f.add_arc(tail, fst.Arc(ilabel, olabel, zero(), head))
                tail = head

            # connect the last state with end node
            f.add_arc(tail, fst.Arc(phone_sym['<eps>'], olabel, zero(), end))
        return f
Beispiel #20
0
def generate_phone_recognition_wfst(n, state_table, phone_table):
    """Build a WFST accepting any single phone found in the lexicon.

    The phones are collected from the module-level ``lex`` mapping.  Each
    phone HMM hangs off the start state through an epsilon arc and ends in a
    final state; there is no way back to the start state.

    Args:
        n (int): states per phone HMM
        state_table: passed through to ``generate_phone_wfst``
        phone_table: passed through to ``generate_phone_wfst``

    Returns:
        the constructed WFST

    """
    f = fst.Fst()

    root = f.add_state()
    f.set_start(root)

    # A set keeps each phone of the lexicon exactly once.
    phone_set = set()
    for pronunciation in lex.values():
        phone_set = phone_set.union(pronunciation)

    for phone in phone_set:
        # An empty arc separates the shared start state from the first state
        # of the phone HMM.
        entry = f.add_state()
        f.add_arc(root, fst.Arc(0, 0, None, entry))

        phone_end = generate_phone_wfst(f, entry, phone, n, state_table,
                                        phone_table)
        f.set_final(phone_end)

    return f
Beispiel #21
0
def build_chain_fst(labels, arc_type='log', vocab=None):
    """
    Build a linear acceptor for the string spelled by ``labels``.
    Args:
        labels - a sequence of labels in the range 1..S
        arc_type - fst arc type (standard or log)
        vocab - not used by this function
    Returns:
        FST consuming symbols in the range 1..S.
    Notes:
        Labels are expected to be greater than zero (zero maps to blank);
        this is not checked here.
    """
    chain = fst.Fst(arc_type=arc_type)
    one = fst.Weight.One(chain.weight_type())

    tail = chain.add_state()
    chain.set_start(tail)
    for label in labels:
        head = chain.add_state()
        chain.add_arc(tail, fst.Arc(label, label, one, head))
        tail = head
    chain.set_final(tail)

    chain.arcsort('ilabel')
    return chain
Beispiel #22
0
def SimpleAutomata(ref_string, levenshtein_distance):
    """Build a Levenshtein-style automaton over a grid of states.

    States are labelled "<consumed characters>;<operations used>".  Arcs are
    described by "input:output:weight" strings and added through
    ``add_arc_to_automate``.  The automaton is drawn to "automata.dot",
    printed and returned.

    Args:
        ref_string: reference string.
        levenshtein_distance: maximum number of edit operations.

    Returns:
        the constructed FST.
    """
    automate = fst.Fst()
    states_dict = {}
    ref_length = len(ref_string)

    def grid_label(consumed, operations):
        return str(consumed) + ";" + str(operations)

    def connect(src_label, dst_label, arc_label):
        add_arc_to_automate(src_label, dst_label, arc_label, automate,
                            states_dict)

    final_dst_state_label = grid_label(ref_length, levenshtein_distance)
    init_state_index = get_index('0;0', automate, states_dict)

    for consumed in range(ref_length + 1):
        string_done = consumed == ref_length
        for operations in range(levenshtein_distance + 1):
            budget_spent = operations == levenshtein_distance
            src_state_label = grid_label(consumed, operations)
            print(str(string_done) + "-" + str(budget_spent))

            if string_done:
                if budget_spent:
                    # bottom-right corner of the grid: the output state
                    final_dst_state_label = src_state_label
                    print("output state")
                else:
                    # only insertions remain once the string is consumed
                    connect(src_state_label,
                            grid_label(consumed, operations + 1),
                            "*:epsilon:1")
                continue

            current_char = ref_string[consumed]
            accepting_dst_state_label = grid_label(consumed + 1, operations)
            if budget_spent:
                print(accepting_dst_state_label)

            # matching the reference character costs nothing
            connect(src_state_label, accepting_dst_state_label,
                    current_char + ":" + current_char + ":" + str(0))
            if budget_spent:
                # no operations left: matching is the only move
                continue

            diagonal_label = grid_label(consumed + 1, operations + 1)
            # deletion
            connect(src_state_label, diagonal_label,
                    "epsilon:" + current_char + ":" + str(1))
            # substitution
            connect(src_state_label, diagonal_label,
                    "*:" + current_char + ":" + str(1))
            # insertion
            connect(src_state_label, grid_label(consumed, operations + 1),
                    "*:" + current_char + ":" + str(1))

    automate.set_start(init_state_index)
    automate.set_final(states_dict[final_dst_state_label],
                       fst.Weight(automate.weight_type(), 1.5))
    automate.draw("automata.dot")
    print(automate)
    return automate
def InitAutomata():
    """Reset the module-level automaton, its state map and its initial state.

    Rebinds the globals ``automate``, ``states_dict`` and
    ``init_state_index``, registering the state labelled '0;0' through
    ``get_index``, then prints the state map and the initial index.
    """
    global automate, states_dict, init_state_index
    states_dict = {}
    automate = fst.Fst()
    # '0;0' is the label of the initial state
    init_state_index = get_index('0;0', automate, states_dict)
    print(states_dict, init_state_index)
 def __init__(self):
     """Start with an empty grapheme table and an empty token FST."""
     # insertion-ordered grapheme table
     self.graphemes_table = OrderedDict()
     # the transducer under construction
     self.token_fst = fst.Fst()
Beispiel #25
0
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build a Levenshtein automaton and return its arcs as a dictionary.

    States come from ``create_states_dico`` and are labelled
    "<consumed characters>;<elementary operations>".  For every state the
    outgoing arcs are added to an FST and recorded in the returned
    dictionary.

    Weights: 0 when a character is consumed, 1 when star or epsilon is
    consumed (insertion, deletion, substitution).  Characters of the
    reference string are the consumed characters; characters of the
    hypothesis string are the emitted ones.

    Args:
        ref_string: reference string.
        levenshtein_distance: maximum number of elementary operations.

    Returns:
        dict: state label -> {arc label ("consumed:emitted:weight") ->
        list of destination state labels}.  The dictionary is also printed.
        The FST built alongside it is not returned.
    """
    # Create the states of the automaton
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)

    automate = fst.Fst()
    automata = {}
    weights = [0, 1, 1, 1]

    # NOTE(review): only these two states are created with add_state(); the
    # arcs below use the indices supplied by create_states_dico/label2int -
    # confirm those indices exist in ``automate``.
    initial_state_index = automate.add_state()  # label2int("0;0", ref_string)
    final_state_index = automate.add_state()  # label2int("5;2", ref_string)
    automate.set_start(initial_state_index)
    automate.set_final(final_state_index,
                       fst.Weight(automate.weight_type(), 1.5))

    def convert_arc(arc_label):
        # "consumed:emitted:weight" -> the three fields run through
        # convertSymToLabel
        fields = arc_label.split(":")
        return (convertSymToLabel(fields[0]), convertSymToLabel(fields[1]),
                convertSymToLabel(fields[2]))

    def add_arc(src_index, arc, dst_index):
        consumed, emitted, weight = arc
        automate.add_arc(
            src_index,
            fst.Arc(consumed, emitted,
                    fst.Weight(automate.weight_type(), weight), dst_index))

    # .items() replaces the Python 2 only .iteritems()
    for state_label, state_index in dict_levenshtein_states.items():
        label_fields = state_label.split(";")
        nb_consummed_chars = int(label_fields[0])  # 1st field of the label
        nb_elementary_operations = int(label_fields[1])  # 2nd field

        # True when the number of consumed characters equals the length of
        # the reference string
        is_last_column = nb_consummed_chars == len(ref_string)
        # True when the number of elementary operations equals the
        # Levenshtein distance
        is_last_row = nb_elementary_operations == levenshtein_distance

        if is_last_column:
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]

        # "up": same column, one more operation (insertion)
        up_dst_label = str(nb_consummed_chars) + ";" + str(
            nb_elementary_operations + 1)
        up_dst_index = label2int(up_dst_label, ref_string)
        insertion_arc_label = "*" + ":" + "epsilon" + ":" + str(1)
        insertion_arc = convert_arc(insertion_arc_label)

        # "diag": next column, one more operation (deletion, substitution)
        diag_dst_label = str(nb_consummed_chars +
                             1) + ";" + str(nb_elementary_operations + 1)
        diag_dst_index = label2int(diag_dst_label, ref_string)
        deletion_arc_label = "epsilon:" + char_from_ref_str + ":" + str(
            weights[1])
        deletion_arc = convert_arc(deletion_arc_label)
        substitution_arc_label = "*:" + char_from_ref_str + ":" + str(
            weights[1])
        substitution_arc = convert_arc(substitution_arc_label)

        # "right": next column, same number of operations (match)
        right_dst_label = str(nb_consummed_chars +
                              1) + ";" + str(nb_elementary_operations)
        right_dst_index = label2int(right_dst_label, ref_string)
        accepting_arc_label = char_from_ref_str + ":" + char_from_ref_str + ":" + str(
            weights[0])
        accepting_arc = convert_arc(accepting_arc_label)

        set_arcs = {}
        if is_last_column and is_last_row:
            # corner state: recorded in the dictionary only, no FST arc
            output_arc_label = "epsilon" + ":" + "epsilon" + ":" + str(0)
            set_arcs[output_arc_label] = []
        elif is_last_column:
            set_arcs[insertion_arc_label] = [up_dst_label]
            add_arc(state_index, insertion_arc, up_dst_index)
        elif is_last_row:
            set_arcs[accepting_arc_label] = [right_dst_label]
            add_arc(state_index, accepting_arc, right_dst_index)
        else:
            set_arcs[accepting_arc_label] = [right_dst_label]
            add_arc(state_index, accepting_arc, right_dst_index)

            set_arcs[deletion_arc_label] = [diag_dst_label]
            add_arc(state_index, deletion_arc, diag_dst_index)

            add_arc(state_index, substitution_arc, diag_dst_index)
            add_arc(state_index, insertion_arc, up_dst_index)

            # insertion and substitution are recorded under the same arc
            # label
            set_arcs[substitution_arc_label] = [
                diag_dst_label, up_dst_label
            ]

        automata[state_label] = set_arcs

    print(automata)

    return (automata)
Beispiel #26
0
def build_lm(dev_fname, isyms_fname, constraints, lattice_output):
    """
    Make a lattice that maps
    lemmas and constraints (or priors) to
    an inflected version, and write it to disk.

    Args:
        dev_fname: path of a tab-separated text file. Once the last two
            columns of a line are dropped, exactly three fields must
            remain: lemma, inflection and constraints. Lemma and
            inflection are whitespace-separated symbol sequences.
        isyms_fname: path of the symbol table, in text format.
        constraints: constraint string selecting which lines are used;
            every "_" is replaced by ";" before the comparison.
        lattice_output: path the final (inverted) lattice is written to.

    Returns:
        None. The result is the file written to ``lattice_output``.
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    # Reverse lookup over the pairs yielded by the symbol table:
    # second element of each pair -> first element.
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst("log")
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # All possible inflections are added, regardless of the prior
    # (applying the prior can make for a more efficient computation).
    # The file is opened in a ``with`` block so the handle is always closed.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            line = line.strip()
            lemma, inflection, cns = line.split("\t")[:-2]
            if cns != constraints:
                continue

            # comparing strings: idx is the first position where lemma and
            # inflection differ (0 when no difference is found)
            idx = 0
            lemma = lemma.split()
            inflection = inflection.split()
            for j, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = j
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            keep = old
            # one identity arc per lemma symbol from idx onwards
            for symbol in lemma[idx:]:
                new = f.add_state()
                f.add_arc(
                    old,
                    fst.Arc(code[symbol], code[symbol],
                            fst.Weight(f.weight_type(), 1.0), new))
                old = new
            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly), through a single "<lemma rest>_<inflection rest>"
            # symbol that must exist in the symbol table
            sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:])
            f.add_arc(
                old,
                fst.Arc(code[sym], code[s_fin],
                        fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)
            f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> self-loops in the <sigma> place holder
    # NOTE(review): ``keep`` is only bound when at least one line matched
    # ``constraints``; it is also a state id taken from the last per-line
    # FST, used here on the determinized lattice - confirm this is intended.
    for c, ltr in code.items():
        # (hard coded) symbols of Russian + 2 more
        if 1 < int(ltr) < 51:
            f_big.add_arc(
                keep,
                fst.Arc(ltr, ltr, fst.Weight(f_big.weight_type(), 1.0),
                        keep))

    f_big.invert()
    # save lattice
    f_big.write(lattice_output)
Beispiel #27
0
def generate_word_sequence_recognition_wfst_bigram(n,
                                                   lex,
                                                   df_bigram_prob,
                                                   original=False,
                                                   weight_fwd=None,
                                                   weight_self=None):
    """ generate a HMM to recognise any word sequence for words in the lexicon,
    with word-to-word arcs weighted by bigram probabilities

    Args:
        n (int): states per phone HMM
        lex: lexicon source, handed to parse_lexicon
        df_bigram_prob: bigram probability table, read as
            df_bigram_prob['Word After', next_word]['Word Before', prev_word]
        original (bool): True/False - original/optimized lexicon
        weight_fwd: weight value, forwarded to generate_phone_wfst
        weight_self: weight value of self node, forwarded to
            generate_phone_wfst
    Returns:
        tuple: (the constructed WFST, the word symbol table)

    """
    # A 'log' FST is only built when both weights are supplied; otherwise the
    # default FST type is used and the word-entry arcs carry no weight.
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None
    lex = parse_lexicon(lex, original)
    # NOTE(review): the symbol tables are generated with a literal 3 rather
    # than ``n`` - confirm this is intended when n != 3.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)
    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)
    # -- dictionaries for initial and last states: word -> list of states,
    #    one entry per pronunciation
    dict_initial = {}
    dict_final = {}
    # make fst
    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            dict_initial.setdefault(word, []).append(initial_state)
            # -- arc from the start state emitting the word
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)
            f.set_final(current_state)
            # -- loop back to the start so another word can follow
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))
            dict_final.setdefault(word, []).append(current_state)
    # -- add bigram arcs: every last state of a word -> every initial state
    #    of every word
    weight_type = f.weight_type()
    for word, last_state_list in dict_final.items():
        for last_state in last_state_list:
            for word_bi, initial_state_list in dict_initial.items():
                # The cost depends only on the word pair, so it is computed
                # once per pair instead of once per destination state.
                prob = df_bigram_prob['Word After', word_bi]['Word Before',
                                                             word]
                if prob == 0:
                    # log(0) is undefined; a very large cost is used instead
                    cost = 1e10
                else:
                    cost = -math.log(prob)
                out_label = output_table.find(word_bi)
                for initial_state in initial_state_list:
                    # Use the FST's own weight type so the arc weight matches
                    # the FST whether or not the 'log' branch was taken above.
                    f.add_arc(
                        last_state,
                        fst.Arc(out_label, out_label, fst.Weight(weight_type, cost),
                                initial_state) if False else
                        fst.Arc(0, out_label, fst.Weight(weight_type, cost),
                                initial_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
Beispiel #28
0
# -*- coding: utf-8 -*-
import bisect
import matplotlib.pyplot as plt
import numpy as np
import pyparsing
import graphviz
import dot2tex
import openfst_python as fst

# Module-level FST; SimpleAutomata() adds its states and arc to this object.
automate = fst.Fst()


def SimpleAutomata():
    """Add two states and one weighted arc to the module-level ``automate``,
    then print the automaton.

    The two states stand for the labels "0;0" (source) and "0;1"
    (destination); the arc stands for the label "2:4:1".
    """
    origin = automate.add_state()
    target = automate.add_state()

    # Values hard-coded from the arc label "2:4:1".
    consumed, transmitted, cost = 2, 4, 1

    # NOTE(review): the transmitted label is passed first and the consumed
    # label second - confirm this order is the intended one.
    arc_weight = fst.Weight(automate.weight_type(), cost)
    automate.add_arc(origin,
                     fst.Arc(transmitted, consumed, arc_weight, target))

    print(automate)
Beispiel #29
0
def generate_WFST_silent(n, lex, weight_fwd, weight_self, original=False):
    """ generate a HMM to recognise any word sequence for words in the lexicon and includes a silence state

    Args:
        n (int): states per phone HMM
        lex: lexicon source, handed to parse_lexicon
        weight_fwd: weight value, forwarded to generate_phone_wfst
        weight_self: weight value of self node, forwarded to
            generate_phone_wfst
        original (bool): True/False - original/optimized lexicon
    Returns:
        tuple: (the constructed WFST, the word symbol table including the
        '<silence>' entry)

    """

    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))

    original_lex = parse_lexicon(lex, original)

    # The silence word and its states are not part of the lexicon: the symbol
    # tables are generated from the lexicon alone and the silence symbols are
    # appended by hand, before the output table is derived from them.
    silent_word = '<silence>'
    silent_phones = ['sil_0', 'sil_1', 'sil_2', 'sil_3', 'sil_4', 'sil_5']
    # NOTE(review): the symbol tables are generated with a literal 3 rather
    # than ``n`` - confirm this is intended when n != 3.
    word_table, phone_table, state_table = generate_symbols_table(
        original_lex, 3)
    word_table.add_symbol(silent_word)
    for phone in silent_phones:
        state_table.add_symbol(phone)
    phone_table.add_symbol('sil')

    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # one path per pronunciation of every lexicon word (silence excluded)
    for word, phone_list in original_lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state

            for phone in phones:
                current_state = generate_phone_wfst(f, current_state, phone, n,
                                                    state_table, output_table,
                                                    weight_fwd, weight_self)

            f.set_final(current_state)
            # loop back to the start so another word can follow
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))

    # the silence path is added separately, through its dedicated builder
    current_state = f.add_state()
    f.add_arc(
        start_state,
        fst.Arc(0, output_table.find(silent_word), none_weight, current_state))
    current_state = generate_silent_phone_wfst(f, current_state, state_table,
                                               output_table)
    f.set_final(current_state)
    f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
Beispiel #30
0
# -*- coding: utf-8 -*-
import bisect
import matplotlib.pyplot as plt
import numpy as np
import pyparsing
import graphviz
import dot2tex
import openfst_python as fst


def printTxt(txt):
    """Print ``txt`` to standard output."""
    print(txt)


automate = fst.Fst()  # create the automaton
automate_states = {
}  # dictionary holding every state of the automaton: keys are the state labels, values are the entries created for them (add_automate_state stores [id returned by add_state(), state_index])


# add_automate_state is meant to be called while walking the automaton; it
# fills automate_states so that each state label is registered exactly once.
def add_automate_state(state_label, state_index):
    """Register ``state_label`` in ``automate_states`` if it is not there yet.

    When the label is already known nothing happens and ``state_index`` is
    returned unchanged. Otherwise a new state is created with
    ``automate.add_state()``, stored together with ``state_index`` under the
    label, and ``state_index + 1`` is returned.
    """
    if state_label in automate_states:
        return state_index
    automate_states[state_label] = [automate.add_state(), state_index]
    return state_index + 1


def create_states_dico(ref_string, levenshtein_distance):
    dict_levenshtein_states = {}
    for column in range(len(ref_string) + 1):