def generate_word_sequence_recognition_wfst_test(n, lex, original=False, weight_fwd=None, weight_self=None):
    """Build a WFST with one branch per pronunciation in the lexicon.

    A single start state fans out through one arc per pronunciation; that arc
    has input label 0 and emits the word's id from the output table. Each
    branch is then extended phone by phone with ``generate_phone_wfst`` and
    its last state is marked final. There is no arc back to the start state.

    Args:
        n (int): states per phone HMM
        lex: lexicon source, forwarded to ``parse_lexicon``
        original (bool): True/False - original/optimized lexicon (forwarded
            to ``parse_lexicon``)
        weight_fwd: weight value forwarded to ``generate_phone_wfst``
        weight_self: self-loop weight forwarded to ``generate_phone_wfst``

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table produced by ``generate_symbols_table``.
    """
    # Only build a weighted ('log') machine when both weights are supplied.
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None

    lex = parse_lexicon(lex, original)
    # NOTE(review): the symbol tables are generated with a literal 3 rather
    # than n - confirm this is intended when n != 3.
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight, initial_state))
            current_state = initial_state
            for phone in phones:
                # The returned state is where the next phone is attached.
                current_state = generate_phone_wfst(
                    f, current_state, phone, n, state_table, output_table,
                    weight_fwd, weight_self)
            f.set_final(current_state)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def create_priors(priors, isym, osym, code):
    """Create a linear FST over the priors, ending in a <sigma> placeholder.

    ``priors`` is a ';'-separated string; each piece is looked up in ``code``
    and becomes one arc (same label on input and output) of a chain starting
    at the start state. After the chain, one ``<sigma>`` arc leads to a last
    state which also carries a ``<sigma>`` self-loop. No final state is set.

    Args:
        priors (str): ';'-separated prior symbols
        isym: input symbol table attached to the FST
        osym: output symbol table attached to the FST
        code: mapping from symbol string to label (must contain "<sigma>")

    Returns:
        tuple: ``(f, sigma_state)`` - the FST and the state holding the
        ``<sigma>`` self-loop.
    """
    # init a transducer
    f = fst.Fst()
    f.set_input_symbols(isym)
    f.set_output_symbols(osym)

    def unit_weight():
        # Weight value 1.0 expressed in the FST's own weight type.
        return fst.Weight(f.weight_type(), 1.0)

    tail = f.add_state()
    f.set_start(tail)

    # adding priors
    for symbol in priors.split(";"):
        head = f.add_state()
        label = code[symbol]
        f.add_arc(tail, fst.Arc(label, label, unit_weight(), head))
        tail = head

    # adding <sigma>: one arc into the placeholder state plus its self-loop
    sigma_state = f.add_state()
    sigma = code["<sigma>"]
    f.add_arc(tail, fst.Arc(sigma, sigma, unit_weight(), sigma_state))
    f.add_arc(sigma_state, fst.Arc(sigma, sigma, unit_weight(), sigma_state))
    return f, sigma_state
def OpenFST_Automata_Test(set_src_states, set_dst_states, set_labels):
    """Print every (source, label, destination) triple described by the inputs.

    Args:
        set_src_states: iterable of 2-element items; the second element is the
            source state (the first element is ignored).
        set_dst_states: nested mapping ``source -> label -> iterable`` of
            2-element items whose second element is a destination state.
        set_labels: mapping ``source -> iterable`` of 2-element items whose
            second element is an arc label.

    NOTE(review): the ignored first elements look like indices - presumably
    callers pass ``enumerate(...)``-style pairs; verify against callers.
    """
    for _, src in set_src_states:
        for _, label in set_labels[src]:
            # Destinations are keyed by the current label (the original code
            # used an undefined name here and raised NameError).
            for _, dst in set_dst_states[src][label]:
                print(src, label, dst)
def build_refiner(isyms_fname, refiner_fname):
    """Build the refiner FST and write it to ``refiner_fname``.

    This FST helps extract the last two states (one last arc) of a machine.
    It has two states: on ``s0`` every non-zero symbol with id below 100 may
    be consumed on a self-loop while emitting ``<epsilon>``, and every
    non-zero symbol may be copied through on an arc to the final state ``s1``.

    Args:
        isyms_fname: path of the text symbol table to read
        refiner_fname: path the resulting FST is written to
    """
    # read isyms: map each symbol string to its integer id
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    code = {symbol: idx for idx, symbol in input_syms}

    # build refiner
    refiner = fst.Fst()
    refiner.set_input_symbols(input_syms)
    refiner.set_output_symbols(input_syms)
    s0 = refiner.add_state()
    s1 = refiner.add_state()

    for idx in code.values():
        if idx == 0:
            # id 0 gets no arcs at all
            continue
        if idx < 100:
            # swallow the symbol: consume it, emit <epsilon>, stay on s0
            refiner.add_arc(
                s0,
                fst.Arc(idx, code["<epsilon>"],
                        fst.Weight(refiner.weight_type(), 1.0), s0))
        # copy the symbol through on the way to the final state
        refiner.add_arc(
            s0,
            fst.Arc(idx, idx, fst.Weight(refiner.weight_type(), 1.0), s1))

    refiner.set_start(s0)
    refiner.set_final(s1)

    # save refiner
    refiner.write(refiner_fname)
def generate_word_sequence_recognition_wfst(n):
    """Build a looping 'log' WFST with one branch per word in ``word_table``.

    ``word_table`` is read from the enclosing (module) scope. For every entry
    other than '<eps>' an arc (labels 0:0) leaves the start state, the word is
    expanded by ``generate_word_wfst``, and the last state of the FST at that
    point is connected back to the start state.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST
    """
    f = fst.Fst('log')

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for _, word in word_table:
        if word == '<eps>':
            continue
        entry_state = f.add_state()
        # NOTE(review): -log(N) is negative; a uniform probability 1/N would
        # be +log(N) as a -log(p) cost - confirm the intended sign.
        entry_weight = fst.Weight('log', -math.log(word_table.num_symbols()))
        f.add_arc(start_state, fst.Arc(0, 0, entry_weight, entry_state))

        word_wfst = generate_word_wfst(f, entry_state, word, n)

        # Loop back from the most recently added state to the start state.
        last_state = list(word_wfst.states())[-1]
        back_weight = fst.Weight('log', -math.log(1.0))
        f.add_arc(last_state, fst.Arc(0, 0, back_weight, start_state))

    return f
def generate_phone_sequence_recognition_wfst(n):
    """Build a looping 'log' WFST with one branch per phone in ``phone_table``.

    ``phone_table`` is read from the enclosing (module) scope. For every entry
    other than '<eps>' an arc (labels 0:0) leaves the start state, the phone
    HMM is attached with ``generate_phone_wfst``, and the state it returns is
    made final and connected back to the start state.

    Args:
        n (int): states per phone HMM

    Returns:
        the constructed WFST
    """
    f = fst.Fst('log')

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for _, phone in phone_table:
        if phone == '<eps>':
            continue
        entry_state = f.add_state()
        # NOTE(review): -log(N) is negative; a uniform probability 1/N would
        # be +log(N) as a -log(p) cost - confirm the intended sign.
        entry_weight = fst.Weight('log', -math.log(phone_table.num_symbols()))
        f.add_arc(start_state, fst.Arc(0, 0, entry_weight, entry_state))

        last_state = generate_phone_wfst(f, entry_state, phone, n)
        f.set_final(last_state)

        # Allow another phone to follow by returning to the start state.
        back_weight = fst.Weight('log', -math.log(1))
        f.add_arc(last_state, fst.Arc(0, 0, back_weight, start_state))

    return f
def __init__(self, lex, original=True):
    """Parse the lexicon, build the symbol tables and insert every word.

    Args:
        lex: lexicon source, forwarded to ``parse_lexicon``
        original (bool): accepted but not used by this constructor.
    """
    # NOTE(review): `original` is ignored - the lexicon is always parsed with
    # False. Confirm whether it should be forwarded.
    self.lex = parse_lexicon(lex, False)

    tables = generate_symbols_table(self.lex, 3)
    self.word_table, self.phone_table, self.state_table = tables
    self.output_table = generate_output_table(self.word_table,
                                              self.phone_table)

    # The FST begins with a single start state, wrapped as the root node.
    self.f = fst.Fst()
    root_state = self.f.add_state()
    self.f.set_start(root_state)
    self.root = self.getNode('', root_state, root=True)

    for word in self.lex:
        self.insert(word)

    self.f.set_input_symbols(self.state_table)
    self.f.set_output_symbols(self.output_table)
def generate_phone_sequence_recognition_wfst(n, state_table, phone_table):
    """Build a looping WFST with one branch per distinct phone in ``lex``.

    ``lex`` is not a parameter; it is read from the enclosing (module) scope.

    Args:
        n (int): states per phone HMM
        state_table: forwarded to ``generate_phone_wfst``
        phone_table: forwarded to ``generate_phone_wfst``

    Returns:
        the constructed WFST
    """
    f = fst.Fst()

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # Collect the distinct phones over all lexicon values.
    phone_set = set()
    for pronunciation in lex.values():
        phone_set = phone_set.union(pronunciation)

    for phone in phone_set:
        # Arc (labels 0:0, no explicit weight) from the start state into the
        # phone HMM.
        hmm_entry = f.add_state()
        f.add_arc(start_state, fst.Arc(0, 0, None, hmm_entry))

        hmm_exit = generate_phone_wfst(f, hmm_entry, phone, n, state_table,
                                       phone_table)

        # Loop back so that another phone may follow, and allow stopping here.
        f.add_arc(hmm_exit, fst.Arc(0, 0, None, start_state))
        f.set_final(hmm_exit)

    return f
def generate_WFST_final_probability(n, lex, weight_fwd, weight_self, weights_final, original=False):
    """Build a looping 'log' word WFST with per-word final-state weights.

    One branch per pronunciation leaves the start state (input label 0,
    output the word's id), is extended phone by phone, loops back to the
    start state, and ends in a final state weighted by the word's final
    probability.

    Args:
        n (int): states per phone HMM
        lex: lexicon source, forwarded to ``parse_lexicon``
        weight_fwd: weight value forwarded to ``generate_phone_wfst``
        weight_self: self-loop weight forwarded to ``generate_phone_wfst``
        weights_final (dict): word -> probability of its final state
        original (bool): True/False - original/optimized lexicon

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table.
    """
    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))

    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    for word, pronunciations in lex.items():
        for phones in pronunciations:
            state = f.add_state()
            word_arc = fst.Arc(0, output_table.find(word), none_weight, state)
            f.add_arc(start_state, word_arc)

            for phone in phones:
                state = generate_phone_wfst(f, state, phone, n, state_table,
                                            output_table, weight_fwd,
                                            weight_self)

            # This default final weight is replaced by the weighted
            # set_final call further down.
            f.set_final(state)
            f.add_arc(state, fst.Arc(0, 0, none_weight, start_state))

            # The last state of the word carries -log(P(final | word)).
            final_weight = fst.Weight('log', -math.log(weights_final[word]))
            f.set_final(state, final_weight)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
def __init__(self):
    """Create an empty lexicon FST together with its bookkeeping containers."""
    self.lexicon_fst = fst.Fst()

    # Containers start empty; the ordered dicts keep insertion order.
    self.lexicons = []
    self.words = OrderedDict()
    self.disambig_graphemes = OrderedDict()

    # Counters / state ids.
    # NOTE(review): the meaning of start=0 and last_s=2 is not visible from
    # this constructor - confirm against the methods that use them.
    self.max_disambig = 0
    self.start = 0
    self.last_s = 2
def build_lm(dev_fname, isyms_fname, constraints, lattice_output, refiner_fname):
    """Make a lattice mapping lemmas and constraints (priors) to inflections.

    Every line of ``dev_fname`` whose constraint field equals ``constraints``
    contributes one path: the priors chain from ``create_priors``, the lemma
    characters from the first point where lemma and inflection differ, and a
    closing arc labelled "<lemma residual>_<inflection residual>" that emits
    '</s>'. The union of all paths is epsilon-removed, determinized, given
    self-loops on the ``keep`` state, inverted and written out.

    Args:
        dev_fname: path of the data file; the last three whitespace-separated
            fields of each line are (constraints, lemma, inflection)
        isyms_fname: path of the text symbol table
        constraints (str): constraints to select, with '_' as separator
        lattice_output: path the lattice is written to
        refiner_fname: not used by this function
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms: symbol string -> integer id
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst()
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # The context manager guarantees the data file is closed, including when
    # a lookup below raises.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            cns, lemma, inflection = line.split()[-3:]
            if cns != constraints:
                continue
            print(cns, lemma, inflection)

            # find idx that the strings diverge (stays 0 when no differing
            # position exists within the common length)
            idx = 0
            for i, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = i
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            # NOTE(review): `keep` is a state id of this per-line FST but is
            # used on f_big after union/determinize below; it is also unbound
            # when no line matches. Confirm both are acceptable.
            keep = old
            for j in range(idx, len(lemma)):
                new = f.add_state()
                f.add_arc(
                    old,
                    fst.Arc(code[lemma[j]], code[lemma[j]],
                            fst.Weight(f.weight_type(), 1.0), new))
                old = new

            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly)
            sym = lemma[idx:] + "_" + inflection[idx:]
            print(lemma, inflection, sym)
            f.add_arc(
                old,
                fst.Arc(code[sym], code[s_fin],
                        fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)

    f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> state in the <sigma place holder>
    for c, ltr in code.items():
        # (hard coded) symbols of Russian + 2 more
        if int(ltr) > 1 and int(ltr) < 36:
            f_big.add_arc(
                keep,
                fst.Arc(code[c], code[c],
                        fst.Weight(f_big.weight_type(), 1.0), keep))

    f_big.invert()

    # save lattice
    f_big.write(lattice_output)
def test_simple(self):
    """A three-state FST reports its state count and final weight."""
    f = fst.Fst()
    s0, s1, s2 = f.add_state(), f.add_state(), f.add_state()
    weight_type = f.weight_type()

    # Two arcs out of s0, both labelled 1:1.
    f.add_arc(s0, fst.Arc(1, 1, fst.Weight(weight_type, 3.0), s1))
    f.add_arc(s0, fst.Arc(1, 1, fst.Weight.One(weight_type), s2))
    f.set_start(s0)
    f.set_final(s2, fst.Weight(weight_type, 1.5))

    # Test fst
    self.assertEqual(f.num_states(), 3)
    self.assertAlmostEqual(float(f.final(s2)), 1.5)
def __init__(self):
    """Create the grammar FST with its two initial states.

    State 0 is registered under '<start>' and state 1 under '<s>'; the start
    state is set to 0 and then to 1, so state 1 is the start state afterwards.
    """
    self.unigram2state = {}
    self.bigram2state = {}
    self.grammar_fst = fst.Fst()
    self.order = 0

    for state_id, symbol in enumerate(('<start>', '<s>')):
        self.grammar_fst.add_state()
        self.grammar_fst.set_start(state_id)
        self.unigram2state[symbol] = state_id

    self.disambig_symbol = '#0'
    self.words_table = {}
def OpenFST_Automata_Example():
    """Build a small three-state example FST and print it.

    Prints the three state ids first, then the FST itself.
    """
    f = fst.Fst()
    s0, s1, s2 = f.add_state(), f.add_state(), f.add_state()
    weight_type = f.weight_type()

    # (source, input label, output label, weight, destination)
    arcs = (
        (s0, 1, 2, fst.Weight(weight_type, 3.0), s1),
        (s0, 1, 3, fst.Weight.One(weight_type), s2),
        (s1, 2, 1, fst.Weight(weight_type, 1.0), s2),
    )
    for src, ilabel, olabel, weight, dst in arcs:
        f.add_arc(src, fst.Arc(ilabel, olabel, weight, dst))

    f.set_start(s0)
    f.set_final(s2, fst.Weight(weight_type, 1.5))

    print(s0, s1, s2)
    print(f)
def make_query(self, cns, lemma):
    """Build a linear acceptor for a query.

    The query is the ';'-separated pieces of ``cns``, then "<sigma>", then the
    characters of ``lemma``, then "</s>". Each symbol is looked up in
    ``self.code`` and becomes one arc of the chain; the last state is final.

    Args:
        cns (str): ';'-separated constraint symbols
        lemma (str): lemma, split into single characters

    Returns:
        the query FST
    """
    query = [*cns.split(";"), "<sigma>", *lemma, "</s>"]

    f = fst.Fst()
    f.set_input_symbols(self.input_syms)
    f.set_output_symbols(self.input_syms)

    tail = f.add_state()
    f.set_start(tail)
    for symbol in query:
        head = f.add_state()
        arc = fst.Arc(self.code[symbol], self.code[symbol],
                      fst.Weight(f.weight_type(), 1.0), head)
        f.add_arc(tail, arc)
        tail = head

    f.set_final(tail)
    return f
def from_vocab(cls, vocab, tokenizer):
    """Build a lexicon FST from ``vocab`` and wrap it in a ``cls`` instance.

    Every word becomes one path from the shared start state: its token ids
    followed by the tokenizer's space id, each shifted by one. The union of
    all paths is epsilon-removed, determinized and minimized.

    Args:
        vocab: iterable of words
        tokenizer: object providing ``token2idx(word)`` (a list of ids) and
            ``space_idx``

    Returns:
        ``cls(fst_path=None, fst=<the built FST>)``
    """
    lexicon = openfst.Fst()

    def add_word(word):
        token_ids = tokenizer.token2idx(word) + [tokenizer.space_idx]

        # Lazily create the shared start state on the first word.
        if not lexicon.num_states():
            initial_state = lexicon.add_state()
            assert initial_state == 0
            lexicon.set_start(initial_state)

        state = lexicon.start()
        for token_id in token_ids:
            # The initial state of FST is state 0, hence the index of chars in
            # the FST should start from 1 to avoid the conflict with the
            # initial state, otherwise wrong decoding results would be given.
            label = token_id + 1
            next_state = lexicon.add_state()
            lexicon.add_arc(state, openfst.Arc(label, label, 0, next_state))
            state = next_state
        lexicon.set_final(state, openfst.Weight.One('tropical'))

    for word in vocab:
        add_word(word)

    # Remove "epsilon" transitions (transitions taken without consuming
    # input). This is needed before determinization but can greatly increase
    # the size of the FST.
    lexicon.rmepsilon()

    # Make the FST deterministic: for any input string there is only one
    # state the FST could be in, so lookups never have to follow multiple
    # transitions from a state.
    lexicon = openfst.determinize(lexicon)

    # Find the simplest equivalent FST; not required, but it decreases the
    # memory usage of the dictionary.
    lexicon.minimize()

    return cls(fst_path=None, fst=lexicon)
def __init__(self, eps='<eps>', sb='<s>', se='</s>', ds='#0'):
    """Create the grammar FST with its two initial states.

    State 0 is registered under ``eps`` and state 1 under ``sb``; the start
    state is set to 0 and then to 1, so state 1 is the start state afterwards.

    Args:
        eps (str): symbol stored as ``self.eps``
        sb (str): symbol stored as ``self.sb``
        se (str): symbol stored as ``self.se``
        ds (str): symbol stored as ``self.disambig_symbol``
    """
    self.eps = eps
    self.sb = sb
    self.se = se
    self.disambig_symbol = ds

    self.order = 0
    self.max_order = 0
    self.words_table = {}

    self.gram2state = {}
    self.grammar_fst = fst.Fst()
    for state_id, symbol in enumerate((self.eps, self.sb)):
        self.grammar_fst.add_state()
        self.gram2state[symbol] = state_id
        self.grammar_fst.set_start(state_id)
def make_input_fst(query, pysym):
    """Build a linear acceptor for ``query`` and write it to 'input.fst'.

    Each element of ``query`` is looked up in ``pysym`` and becomes one arc
    of a chain leaving the start state; a closing '<eps>' arc joins the chain
    to the final state.

    Args:
        query: iterable of symbols (e.g. a string, iterated per character)
        pysym: mapping from symbol to label (must contain '<eps>')

    Returns:
        the constructed FST
    """
    f = fst.Fst()
    start = f.add_state()
    end = f.add_state()
    f.set_start(start)
    f.set_final(end, fst.Weight(f.weight_type(), 0.0))

    def zero_weight():
        return fst.Weight(f.weight_type(), 0.0)

    tail = start
    for ch in query:
        head = f.add_state()
        label = pysym[ch]
        f.add_arc(tail, fst.Arc(label, label, zero_weight(), head))
        tail = head

    # Join the end of the chain to the final state.
    closing = fst.Arc(pysym['<eps>'], pysym['<eps>'], zero_weight(), end)
    f.add_arc(tail, closing)

    f.write('input.fst')
    return f
def make_fst(word_sym, phone_sym, pydict_file):
    """Build a lexicon transducer from a dictionary file.

    Each line of ``pydict_file`` is split on single spaces; the characters of
    the first field are the input symbols of one path from ``start`` to
    ``end`` and the second field is the word emitted on the first arc of that
    path. All other arcs of the path, including the closing arc into ``end``,
    emit '<eps>'.

    Args:
        word_sym: mapping from output symbol to label (must contain '<eps>',
            '<s>' and '</s>')
        phone_sym: mapping from input symbol to label (must contain '<eps>')
        pydict_file: path of the dictionary file

    Returns:
        the constructed FST
    """
    with open(pydict_file, 'r') as rp:
        f = fst.Fst()
        start = f.add_state()
        end = f.add_state()
        f.set_start(start)

        def zero_weight():
            return fst.Weight(f.weight_type(), 0.0)

        eps_in = phone_sym['<eps>']
        eps_out = word_sym['<eps>']

        # self-loop on start emitting <s>
        f.add_arc(start, fst.Arc(eps_in, word_sym['<s>'], zero_weight(), start))
        # self-loop on end emitting </s>
        f.add_arc(end, fst.Arc(eps_in, word_sym['</s>'], zero_weight(), end))
        # end --> start, so that another entry can follow
        f.add_arc(end, fst.Arc(eps_in, eps_out, zero_weight(), start))
        f.set_final(end, zero_weight())

        for line in rp:
            items = line.strip().split(' ')
            prev_state = start
            for i, pych in enumerate(items[0]):
                n = f.add_state()
                # The word is emitted once, on the first arc of the path.
                olabel = word_sym[items[1]] if i == 0 else eps_out
                f.add_arc(
                    prev_state,
                    fst.Arc(phone_sym[pych], olabel, zero_weight(), n))
                prev_state = n

            # Connect the last state with the end node. This arc always emits
            # <eps>: reusing the loop's output label here made entries whose
            # key has exactly one character emit their word twice.
            f.add_arc(prev_state, fst.Arc(eps_in, eps_out, zero_weight(), end))

        return f
def generate_phone_recognition_wfst(n, state_table, phone_table):
    """Build a WFST with one branch per distinct phone in ``lex``.

    ``lex`` is not a parameter; it is read from the enclosing (module) scope.
    Unlike the sequence variant there is no arc back to the start state.

    Args:
        n (int): states per phone HMM
        state_table: forwarded to ``generate_phone_wfst``
        phone_table: forwarded to ``generate_phone_wfst``

    Returns:
        the constructed WFST
    """
    f = fst.Fst()

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # Gather all the phones in the lexicon. There are lots of ways to do
    # this; here a set collects every unique phone.
    phone_set = set()
    for pronunciation in lex.values():
        phone_set = phone_set.union(pronunciation)

    for phone in phone_set:
        # An empty arc is needed from the start state to where the actual
        # phone HMM will begin. If you can't see why this is needed, try
        # without it!
        hmm_entry = f.add_state()
        f.add_arc(start_state, fst.Arc(0, 0, None, hmm_entry))

        hmm_exit = generate_phone_wfst(f, hmm_entry, phone, n, state_table,
                                       phone_table)
        f.set_final(hmm_exit)

    return f
def build_chain_fst(labels, arc_type='log', vocab=None):
    """Build an acceptor for the string given by the elements of ``labels``.

    Args:
        labels: a sequence of labels in the range 1..S
        arc_type: FST arc type (standard or log)
        vocab: not used by this function

    Returns:
        FST consuming symbols in the range 1..S, arc-sorted by input label.

    Notes:
        Elements of labels are assumed to be greater than zero (which maps
        to blank)!
    """
    chain = fst.Fst(arc_type=arc_type)
    one = fst.Weight.One(chain.weight_type())

    tail = chain.add_state()
    chain.set_start(tail)
    for label in labels:
        head = chain.add_state()
        chain.add_arc(tail, fst.Arc(label, label, one, head))
        tail = head

    # The end of the chain is the only final state.
    chain.set_final(tail)
    chain.arcsort('ilabel')
    return chain
def SimpleAutomata(ref_string, levenshtein_distance):
    """Build, draw and print an edit-distance automaton for ``ref_string``.

    States are labelled "<consumed chars>;<operations>". Arcs are registered
    through ``add_arc_to_automate`` with labels of the form
    "<first>:<second>:<weight>". The FST is drawn to "automata.dot" and
    printed before being returned.

    Args:
        ref_string (str): reference string
        levenshtein_distance (int): maximum number of operations

    Returns:
        the constructed FST
    """
    automate = fst.Fst()
    states_dict = {}
    ref_length = len(ref_string)

    def state_label(consumed, operations):
        return str(consumed) + ";" + str(operations)

    def connect(src_label, dst_label, arc_label):
        add_arc_to_automate(src_label, dst_label, arc_label, automate,
                            states_dict)

    final_dst_state_label = state_label(ref_length, levenshtein_distance)
    init_state_index = get_index('0;0', automate, states_dict)

    for consumed in range(ref_length + 1):
        for operations in range(levenshtein_distance + 1):
            src = state_label(consumed, operations)
            at_end = consumed == ref_length
            at_limit = operations == levenshtein_distance
            print(str(at_end) + "-" + str(at_limit))

            if at_end and at_limit:
                # Bottom-right corner: this is the output state.
                final_dst_state_label = src
                print("output state")
                continue

            if at_end:
                # Whole reference consumed: only insertions remain.
                connect(src, state_label(consumed, operations + 1),
                        "*:epsilon:1")
                continue

            ref_char = ref_string[consumed]
            accepting_dst = state_label(consumed + 1, operations)
            if at_limit:
                print(accepting_dst)
            # Accepting arc: consume the reference character at no cost.
            connect(src, accepting_dst,
                    ref_char + ":" + ref_char + ":" + str(0))
            if at_limit:
                # No operations left: matching is the only option.
                continue

            edit_dst = state_label(consumed + 1, operations + 1)
            # Deletion, then substitution, both moving diagonally.
            connect(src, edit_dst, "epsilon:" + ref_char + ":" + str(1))
            connect(src, edit_dst, "*:" + ref_char + ":" + str(1))
            # Insertion: stay on the same character, spend one operation.
            connect(src, state_label(consumed, operations + 1),
                    "*:" + ref_char + ":" + str(1))

    automate.set_start(init_state_index)
    automate.set_final(states_dict[final_dst_state_label],
                       fst.Weight(automate.weight_type(), 1.5))
    automate.draw("automata.dot")
    print(automate)
    return automate
def InitAutomata():
    """(Re)initialise the module-level automaton state.

    Rebinds the globals ``automate`` (a fresh FST), ``states_dict`` (an empty
    dict) and ``init_state_index`` (whatever ``get_index`` returns for the
    label '0;0'), then prints the dict and the index.
    """
    global automate, states_dict, init_state_index

    # Start from an empty automaton and an empty label registry.
    automate = fst.Fst()
    states_dict = {}

    # Register the initial state under its label.
    init_state_index = get_index('0;0', automate, states_dict)

    print(states_dict, init_state_index)
def __init__(self):
    """Create an empty token FST and an empty, ordered grapheme table."""
    # Insertion-ordered table; nothing is registered at construction time.
    self.graphemes_table = OrderedDict()
    # The FST starts without any state.
    self.token_fst = fst.Fst()
def Levenshtein_Automata_Dico(ref_string, levenshtein_distance):
    """Build a Levenshtein automaton and return its dictionary description.

    States come from ``create_states_dico`` and are labelled
    "<consumed chars>;<elementary operations>". For every state the outgoing
    arcs are added to a local FST and recorded in a dictionary mapping
    ``state label -> {arc label -> [destination state labels]}``. Arc labels
    have the form "<consumed>:<emitted>:<weight>".

    For the weights: 0 when a character is consumed, 1 when star or epsilon
    is consumed (insertion, deletion, substitution). The characters of the
    reference string are the consumed characters and those of the hypothesis
    string are the emitted ones.

    Args:
        ref_string (str): reference string
        levenshtein_distance (int): maximum number of elementary operations

    Returns:
        dict: the dictionary description (the FST itself is not returned).
    """

    def convert_arc(arc_label):
        # Convert the three ':'-separated fields, in order.
        parts = arc_label.split(":")
        return (convertSymToLabel(parts[0]),
                convertSymToLabel(parts[1]),
                convertSymToLabel(parts[2]))

    # Create the states of the automaton.
    dict_levenshtein_states = create_states_dico(ref_string,
                                                 levenshtein_distance)

    # Create the arcs leaving each state.
    automate = fst.Fst()
    automata = {}
    weights = [0, 1, 1, 1]

    # NOTE(review): only these two states are added to `automate` here, while
    # the arcs below use indices from create_states_dico / label2int -
    # confirm those indices exist in this FST.
    initial_state_index = automate.add_state()  # label2int("0;0", ref_string)
    final_state_index = automate.add_state()  # label2int("5;2", ref_string)
    automate.set_start(initial_state_index)
    automate.set_final(final_state_index,
                       fst.Weight(automate.weight_type(), 1.5))

    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for state_label, state_index in dict_levenshtein_states.items():
        # first field of the label
        nb_consummed_chars = int(state_label.split(";")[0])
        # second field of the label
        nb_elementary_operations = int(state_label.split(";")[1])
        set_arcs = {}

        if nb_consummed_chars == len(ref_string):
            char_from_ref_str = "epsilon"
        else:
            char_from_ref_str = ref_string[nb_consummed_chars]

        # "up": same column, one more operation (insertion)
        up_dst_label = str(nb_consummed_chars) + ";" + str(
            nb_elementary_operations + 1)
        up_dst_index = label2int(up_dst_label, ref_string)
        insertion_arc_label = "*" + ":" + "epsilon" + ":" + str(1)
        (insertion_consummed_char, insertion_transmitted_char,
         insertion_weight) = convert_arc(insertion_arc_label)

        # "diag": next column, one more operation (deletion / substitution)
        diag_dst_label = str(nb_consummed_chars + 1) + ";" + str(
            nb_elementary_operations + 1)
        diag_dst_index = label2int(diag_dst_label, ref_string)
        deletion_arc_label = "epsilon:" + char_from_ref_str + ":" + str(
            weights[1])
        (deletion_consummed_char, deletion_transmitted_char,
         deletion_weight) = convert_arc(deletion_arc_label)
        substitution_arc_label = "*:" + char_from_ref_str + ":" + str(
            weights[1])
        (substitution_consummed_char, substitution_transmitted_char,
         substitution_weight) = convert_arc(substitution_arc_label)

        # "right": next column, same number of operations (match)
        right_dst_label = str(nb_consummed_chars + 1) + ";" + str(
            nb_elementary_operations)
        right_dst_index = label2int(right_dst_label, ref_string)
        accepting_arc_label = (char_from_ref_str + ":" + char_from_ref_str +
                               ":" + str(weights[0]))
        (accepting_consummed_char, accepting_transmitted_char,
         accepting_weight) = convert_arc(accepting_arc_label)

        # True when the number of consumed characters equals the length of
        # the reference string.
        is_last_column = nb_consummed_chars == len(ref_string)
        # True when the number of elementary operations equals the
        # Levenshtein distance.
        is_last_row = nb_elementary_operations == levenshtein_distance

        if is_last_column and is_last_row:
            output_arc_label = "epsilon" + ":" + "epsilon" + ":" + str(0)
            set_arcs[output_arc_label] = []
        elif is_last_column:
            set_arcs[insertion_arc_label] = [up_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(insertion_consummed_char, insertion_transmitted_char,
                        fst.Weight(automate.weight_type(), insertion_weight),
                        up_dst_index))
        elif is_last_row:
            set_arcs[accepting_arc_label] = [right_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(accepting_consummed_char, accepting_transmitted_char,
                        fst.Weight(automate.weight_type(), accepting_weight),
                        right_dst_index))
        else:
            set_arcs[accepting_arc_label] = [right_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(accepting_consummed_char, accepting_transmitted_char,
                        fst.Weight(automate.weight_type(), accepting_weight),
                        right_dst_index))

            set_arcs[deletion_arc_label] = [diag_dst_label]
            automate.add_arc(
                state_index,
                fst.Arc(deletion_consummed_char, deletion_transmitted_char,
                        fst.Weight(automate.weight_type(), deletion_weight),
                        diag_dst_index))

            automate.add_arc(
                state_index,
                fst.Arc(substitution_consummed_char,
                        substitution_transmitted_char,
                        fst.Weight(automate.weight_type(),
                                   substitution_weight),
                        diag_dst_index))

            automate.add_arc(
                state_index,
                fst.Arc(insertion_consummed_char, insertion_transmitted_char,
                        fst.Weight(automate.weight_type(), insertion_weight),
                        up_dst_index))

            # Insertion and substitution are recorded together under the
            # substitution label.
            set_arcs[substitution_arc_label] = [diag_dst_label, up_dst_label]

        automata[state_label] = set_arcs

    print(automata)
    return (automata)
def build_lm(dev_fname, isyms_fname, constraints, lattice_output):
    """Make a lattice mapping lemmas and constraints (priors) to inflections.

    Every line of ``dev_fname`` whose constraint field equals ``constraints``
    contributes one path: the priors chain from ``create_priors``, the lemma
    symbols from the first point where lemma and inflection differ, and a
    closing arc labelled "<lemma residual>_<inflection residual>" that emits
    '</s>'. The union of all paths is epsilon-removed, determinized, given
    self-loops on the ``keep`` state, inverted and written out.

    Args:
        dev_fname: path of the tab-separated data file; after dropping the
            last two fields each line must leave exactly
            (lemma, inflection, constraints), where lemma and inflection are
            whitespace-separated symbol sequences
        isyms_fname: path of the text symbol table
        constraints (str): constraints to select, with '_' as separator
        lattice_output: path the lattice is written to
    """
    # rewrite constraints
    constraints = constraints.replace("_", ";")

    # read isyms: symbol string -> integer id
    input_syms = fst.SymbolTable.read_text(isyms_fname)
    s_fin = '</s>'
    code = {}
    for ltr, c in input_syms:
        code[c] = ltr

    # init the lattice
    f_big = fst.Fst("log")
    f_big.set_input_symbols(input_syms)
    f_big.set_output_symbols(input_syms)

    # All possible inflections for the selected constraints are added. The
    # context manager guarantees the data file is closed, including when a
    # lookup below raises.
    with open(dev_fname, 'r') as dev_file:
        for line in dev_file:
            line = line.strip()
            lemma, inflection, cns = line.split("\t")[:-2]
            if cns != constraints:
                continue

            lemma = lemma.split()
            inflection = inflection.split()

            # find idx where the sequences diverge (stays 0 when no differing
            # position exists within the common length)
            idx = 0
            for j, (lm, flc) in enumerate(zip(lemma, inflection)):
                if lm != flc:
                    idx = j
                    break

            f, old = create_priors(cns, input_syms, input_syms, code)
            # NOTE(review): `keep` is a state id of this per-line FST but is
            # used on f_big after union/determinize below; it is also unbound
            # when no line matches. Confirm both are acceptable.
            keep = old
            for j in range(idx, len(lemma)):
                new = f.add_state()
                f.add_arc(
                    old,
                    fst.Arc(code[lemma[j]], code[lemma[j]],
                            fst.Weight(f.weight_type(), 1.0), new))
                old = new

            new = f.add_state()
            # the residual of the lemma is mapped to the inflection residual
            # (indirectly)
            sym = "".join(lemma[idx:]) + "_" + "".join(inflection[idx:])
            f.add_arc(
                old,
                fst.Arc(code[sym], code[s_fin],
                        fst.Weight(f.weight_type(), 1.0), new))
            f.set_final(new)
            f_big.union(f)

    f_big = fst.determinize(f_big.rmepsilon())

    # add <sigma> state in the <sigma place holder>
    for c, ltr in code.items():
        # (hard coded) symbols of Russian + 2 more
        if int(ltr) > 1 and int(ltr) < 51:
            f_big.add_arc(
                keep,
                fst.Arc(code[c], code[c],
                        fst.Weight(f_big.weight_type(), 1.0), keep))

    f_big.invert()

    # save lattice
    f_big.write(lattice_output)
def generate_word_sequence_recognition_wfst_bigram(n, lex, df_bigram_prob, original=False, weight_fwd=None, weight_self=None):
    """Build a looping word WFST with additional bigram-weighted arcs.

    One branch per pronunciation leaves the start state (input label 0,
    output the word's id), is extended phone by phone, is made final and
    loops back to the start state. In addition, the last state of every
    branch is linked to the first state of every branch by an arc that emits
    the following word and carries -log of the bigram probability (1e10 when
    the probability is 0).

    Args:
        n (int): states per phone HMM
        lex: lexicon source, forwarded to ``parse_lexicon``
        df_bigram_prob: table indexed as
            ``df_bigram_prob['Word After', next_word]['Word Before', word]``
        original (bool): True/False - original/optimized lexicon
        weight_fwd: weight value forwarded to ``generate_phone_wfst``
        weight_self: self-loop weight forwarded to ``generate_phone_wfst``

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table.
    """
    # Only build a weighted ('log') machine when both weights are supplied.
    # NOTE(review): the bigram arcs below always use 'log' weights, also when
    # the FST was created with the default arc type - confirm.
    if weight_fwd is not None and weight_self is not None:
        f = fst.Fst('log')
        none_weight = fst.Weight('log', -math.log(1))
    else:
        f = fst.Fst()
        none_weight = None

    lex = parse_lexicon(lex, original)
    word_table, phone_table, state_table = generate_symbols_table(lex, 3)
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    # word -> list of first / last states of its branches
    dict_initial = {}
    dict_final = {}

    for word, phone_list in lex.items():
        for phones in phone_list:
            initial_state = f.add_state()
            dict_initial.setdefault(word, []).append(initial_state)

            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight,
                        initial_state))
            current_state = initial_state
            for phone in phones:
                current_state = generate_phone_wfst(
                    f, current_state, phone, n, state_table, output_table,
                    weight_fwd, weight_self)
            f.set_final(current_state)
            f.add_arc(current_state, fst.Arc(0, 0, none_weight, start_state))

            dict_final.setdefault(word, []).append(current_state)

    # Bigram arcs: last state of `word` -> first state of `word_bi`.
    for word, last_state_list in dict_final.items():
        # The weight and output label depend only on the word pair, so they
        # are computed once per pair instead of once per state pair.
        transitions = []
        for word_bi, initial_state_list in dict_initial.items():
            prob = df_bigram_prob['Word After', word_bi]['Word Before', word]
            # A zero probability cannot be log-transformed; use a huge cost.
            cost = 1e10 if prob == 0 else -math.log(prob)
            transitions.append((output_table.find(word_bi),
                                fst.Weight('log', cost),
                                initial_state_list))

        for last_state in last_state_list:
            for olabel, weight, initial_state_list in transitions:
                for initial_state in initial_state_list:
                    f.add_arc(last_state,
                              fst.Arc(0, olabel, weight, initial_state))

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
# -*- coding: utf-8 -*-
import bisect

import matplotlib.pyplot as plt
import numpy as np
import pyparsing
import graphviz
import dot2tex
import openfst_python as fst

# Module-level automaton that SimpleAutomata() extends on every call.
automate = fst.Fst()


def SimpleAutomata():
    """Add two states and one arc to the module-level FST, then print it.

    The states stand for the labels "0;0" (source) and "0;1" (destination);
    the arc stands for the label "2:4:1".
    """
    src_state_index = automate.add_state()  # state "0;0"
    dst_state_index = automate.add_state()  # state "0;1"

    # Fields of the arc label "2:4:1".
    consummed_char, transmitted_char, weight = 2, 4, 1

    # NOTE(review): the arc is built with the transmitted char as input label
    # and the consumed char as output label - confirm this order is intended.
    arc = fst.Arc(transmitted_char, consummed_char,
                  fst.Weight(automate.weight_type(), weight),
                  dst_state_index)
    automate.add_arc(src_state_index, arc)
    print(automate)
def generate_WFST_silent(n, lex, weight_fwd, weight_self, original=False):
    """Build a looping 'log' word WFST that also contains a silence branch.

    Every pronunciation in the lexicon gets a branch that leaves the start
    state (input label 0, output the word's id), is extended phone by phone,
    is made final and loops back to the start state. A separate branch for
    '<silence>' is built with ``generate_silent_phone_wfst``.

    Args:
        n (int): states per phone HMM
        lex: lexicon source, forwarded to ``parse_lexicon``
        weight_fwd: weight value forwarded to ``generate_phone_wfst``
        weight_self: self-loop weight forwarded to ``generate_phone_wfst``
        original (bool): True/False - original/optimized lexicon

    Returns:
        tuple: ``(f, word_table)`` - the constructed WFST and the word
        symbol table (including '<silence>').
    """
    f = fst.Fst('log')
    none_weight = fst.Weight('log', -math.log(1))
    original_lex = parse_lexicon(lex, original)

    # add the silent states
    silent_word = '<silence>'
    silent_phones = ['sil_0', 'sil_1', 'sil_2', 'sil_3', 'sil_4', 'sil_5']
    # NOTE(review): silence_lex is built but never read afterwards; the
    # silence symbols are added to the tables explicitly below.
    silence_lex = original_lex.copy()
    silence_lex[silent_word] = [silent_phones]

    word_table, phone_table, state_table = generate_symbols_table(
        original_lex, 3)
    word_table.add_symbol(silent_word)
    for silent_phone in silent_phones:
        state_table.add_symbol(silent_phone)
    phone_table.add_symbol('sil')
    output_table = generate_output_table(word_table, phone_table)

    # create a single start state
    start_state = f.add_state()
    f.set_start(start_state)

    def loop_back(last_state):
        # Mark the end of a branch final and return to the start state.
        f.set_final(last_state)
        f.add_arc(last_state, fst.Arc(0, 0, none_weight, start_state))

    # Regular words: iterating the original lexicon skips the silent phones.
    for word, pronunciations in original_lex.items():
        for phones in pronunciations:
            state = f.add_state()
            f.add_arc(
                start_state,
                fst.Arc(0, output_table.find(word), none_weight, state))
            for phone in phones:
                state = generate_phone_wfst(f, state, phone, n, state_table,
                                            output_table, weight_fwd,
                                            weight_self)
            loop_back(state)

    # The silence branch is added separately.
    state = f.add_state()
    f.add_arc(
        start_state,
        fst.Arc(0, output_table.find(silent_word), none_weight, state))
    state = generate_silent_phone_wfst(f, state, state_table, output_table)
    loop_back(state)

    f.set_input_symbols(state_table)
    f.set_output_symbols(output_table)
    return f, word_table
# -*- coding: utf-8 -*- import bisect import matplotlib.pyplot as plt import numpy as np import pyparsing import graphviz import dot2tex import openfst_python as fst def printTxt(txt): print(txt) automate = fst.Fst() # creation de l'automate automate_states = { } # dictionnaire contenant tous les etats de l'automate : les cles sont les labels des etats et les valeurs sont les etats crees # La fonction add_automate_state est appelee lors du parcours de l'automate et permet de creer un dict contenant tous les etats de l'automate une et une fois seulement # Si le label fait deja partie du dictionnaire alors, elle ne fait rien, sinon elle ajoute le nouveau label et cree l'etat correspondant avec add_state() def add_automate_state(state_label, state_index): if (state_label not in automate_states): automate_states[state_label] = [automate.add_state(), state_index] state_index += 1 return state_index def create_states_dico(ref_string, levenshtein_distance): dict_levenshtein_states = {} for column in range(len(ref_string) + 1):