def make_edit(sigma):
    """Build an edit-distance function over the alphabet ``sigma``.

    A single-state transducer is created whose self-loops map every symbol
    of the alphabet (plus ``'<eps>'``) to every other one.  Arcs that keep a
    symbol unchanged cost 0, all others cost 1; the ``<eps>:<eps>`` arc is
    left out.

    Args:
        sigma: iterable of alphabet symbols.  It is copied, so the caller's
            collection is not modified.

    Returns:
        A function ``distance(a, b)`` returning ``(dist, align)`` where
        ``dist`` is the integer cost of the best path through
        ``a o edit o b`` and ``align`` is a list of
        ``(input symbol, output symbol)`` pairs along that path, with
        ``'<eps>'`` shown as ``'-'``.
    """
    syms = fst.SymbolTable()

    # Work on a private copy: adding '<eps>' directly to ``sigma`` would leak
    # the marker into the caller's set and would fail for non-set iterables.
    alphabet = set(sigma)
    alphabet.add('<eps>')

    # Create the transducer: one state, start and final, with only self-loops.
    edit = fst.StdVectorFst()
    edit.start = edit.add_state()
    edit[0].final = True
    for x in alphabet:
        for y in alphabet:
            if x == y == '<eps>':
                continue
            edit.add_arc(0, 0, syms[x], syms[y], (0 if x == y else 1))

    def distance(a, b):
        """Return ``(dist, align)`` for the strings ``a`` and ``b``."""
        # Compose a o edit transducer o b
        comp = make_input(a, syms) >> edit >> make_input(b, syms)

        # Compute distance: entry 0 of the reverse shortest distances.
        distances = comp.shortest_distance(reverse=True)
        dist = int(distances[0])

        # Find the best alignment and re-order its states.
        alignment = comp.shortest_path()
        alignment.top_sort()

        # Replace "<eps>" -> "-"
        dash = syms['-']
        eps = syms['<eps>']
        alignment.relabel(ipairs=[(eps, dash)], opairs=[(eps, dash)])

        # Take the first arc of each state.  A state without arcs ends the
        # walk.  The previous ``next(iter(state))`` inside a generator
        # expression relied on StopIteration for this, which Python 3.7+
        # converts into RuntimeError (PEP 479).
        align = []
        for state in alignment:
            arc = next(iter(state), None)
            if arc is None:
                break
            align.append((syms.find(arc.ilabel), syms.find(arc.olabel)))
        return dist, align

    return distance
def make_compounder(words, word_ids):
    """Build a one-state transducer that rewrites ``<space>`` labels.

    The single state (addressed as state 0) is both start and final and
    carries only self-loops:

    * ``<space>`` -> ``<eps>``, ``<space>`` -> ``+C+``, ``<space>`` -> ``+D+``
    * every id in ``word_ids`` mapped to itself

    Label ids are taken from the ``syms`` table defined outside this
    function.

    Args:
        words: not used by this function.
        word_ids: iterable of label ids that pass through unchanged.

    Returns:
        The constructed ``fst.StdVectorFst``.
    """
    compounder = fst.StdVectorFst()
    compounder.start = compounder.add_state()

    space_id = syms["<space>"]
    # Same lookup order as before: <eps>, then +C+, then +D+.
    for replacement in ("<eps>", "+C+", "+D+"):
        compounder.add_arc(0, 0, space_id, syms[replacement])

    # Identity self-loop for each word label.
    for word_id in word_ids:
        compounder.add_arc(0, 0, word_id, word_id)

    compounder[0].final = True
    return compounder
def lattice_to_nbest(lat, n=1):
    """Extract the output label ids of the ``n`` best paths of a lattice.

    Args:
        lat (fst.LogVectorFst): the lattice; a ``StdVectorFst`` is accepted
            as well.
        n (int): number of paths to extract.

    Returns:
        Whatever ``fst_shortest_path_to_lists`` produces for the n-shortest
        path FST (the n-best lists).
    """
    # The log semiring has no best path, so convert the lattice to the
    # tropical semiring before asking for the shortest paths.
    tropical = fst.StdVectorFst(lat)
    return fst_shortest_path_to_lists(tropical.shortest_path(n))
def make_input(word, syms):
    """Make a character input transducer for ``word``.

    The result is a linear chain with one identity arc per character::

        [0] =w:w=> 1 =o:o=> 2 =r:r=> 3 =d:d=> (4)

    Args:
        word: iterable of characters.
        syms: mapping from character to label id, indexed with ``syms[c]``.

    Returns:
        The ``fst.StdVectorFst`` chain; its last state is final.  For an
        empty ``word`` the start state itself is final.
    """
    chain = fst.StdVectorFst()
    chain.start = chain.add_state()

    tail = chain.start
    for char in word:
        head = chain.add_state()
        label = syms[char]
        chain.add_arc(tail, head, label, label)
        tail = head

    chain[tail].final = True
    return chain
def make_sentence_fsa(words, word_ids):
    """Build a linear acceptor for a sentence with ``<space>`` between words.

    Each id in ``word_ids`` becomes one identity arc.  Before every word
    except the first, an identity arc labelled ``<space>`` is inserted.
    The ``<space>`` id comes from the ``syms`` table defined outside this
    function.  States are addressed by consecutive integers starting at 0.

    Args:
        words: not used by this function.
        word_ids: iterable of label ids, in sentence order.

    Returns:
        The ``fst.StdVectorFst`` chain; its last state is final.
    """
    sentence = fst.StdVectorFst()
    sentence.start = sentence.add_state()
    space_id = syms["<space>"]

    tail = 0
    for position, word_id in enumerate(word_ids):
        # Every word but the first is preceded by a <space> arc.
        labels = (space_id, word_id) if position else (word_id,)
        for label in labels:
            sentence.add_state()
            sentence.add_arc(tail, tail + 1, label, label)
            tail += 1

    sentence[tail].final = True
    return sentence
def load_lat(fn):
    """Read an FST from ``fn`` and return it as an ``fst.StdVectorFst``.

    Args:
        fn: file name passed to ``fst.read``.

    Returns:
        The loaded FST wrapped in ``fst.StdVectorFst``.
    """
    return fst.StdVectorFst(fst.read(fn))
def lattice_to_word_posterior_lists(lat, n=1):
    """Turn the ``n`` best paths of a lattice into word lists.

    Args:
        lat: the lattice; it is converted with ``fst.StdVectorFst(lat)``.
        n (int): number of paths to extract.

    Returns:
        Whatever ``fst_shortest_path_to_word_lists`` produces for the
        n-shortest path FST.
    """
    # The log semiring has no best path, so convert the lattice to the
    # tropical semiring before asking for the shortest paths.
    tropical = fst.StdVectorFst(lat)
    return fst_shortest_path_to_word_lists(tropical.shortest_path(n))