def add_ngram_arcs(fst, symtab, lm, n, sidtab):
    """
    Add arcs for all N-grams in the language model, where N=N (the
    order of the model, that is).

    No states are created here: every arc runs between states that are
    already registered in sidtab.

    fst     -- FST being built; receives one AddArc() call per kept N-gram.
    symtab  -- symbol table; Find(word) returns -1 for unknown words.
    lm      -- language model; lm.mgrams(n - 1) yields the N-grams, each
               exposing .words and .log_prob.
    n       -- order of the N-grams to add.
    sidtab  -- mapping from word tuples to state IDs (read only here).

    Raises RuntimeError if no suffix of an N-gram has a state in sidtab.
    Raises KeyError if an in-vocabulary history has no state in sidtab.
    """
    for ng in lm.mgrams(n - 1):
        words = ng.words
        wsym = symtab.Find(words[n - 1])
        if wsym == -1:  # OOV
            continue
        if words[n - 1] == '<s>':  # non-event
            continue
        history = tuple(words[:n - 1])
        if '</s>' in history:
            continue
        # Skip the whole N-gram when its history contains an OOV.  (A bare
        # `continue` inside a loop over the history words would only
        # advance that inner loop and skip nothing.)
        if any(symtab.Find(w) == -1 for w in history):
            continue
        src = sidtab[history]
        # Find longest suffix N-gram that exists
        for spos in range(1, n):
            suffix = tuple(words[spos:])
            if suffix in sidtab:
                break
        else:
            raise RuntimeError("Unable to find suffix N-gram for %s"
                               % (tuple(words),))
        dest = sidtab[suffix]
        fst.AddArc(src, openfst.StdArc(wsym, wsym, -ng.log_prob, dest))
def build_dictfst(lmfst):
    """
    Construct a spelling transducer whose output side is the input
    symbol table of lmfst and whose input side is a freshly created
    "letters" table.

    Each word with a non-zero symbol ID gets its own path from the start
    state: one arc emitting the word with input label 0, one arc per
    input symbol of its spelling, and a closing 0:0 arc into the single
    final state.  The end-of-sentence token '</s>' is kept as one input
    symbol rather than being spelled out.

    Returns the new FST with both symbol tables attached.
    """
    def spelling(word):
        # End-of-sentence is a single indivisible input symbol
        return [word] if word == '</s>' else word

    letters = openfst.SymbolTable("letters")
    # Registered first; presumably this makes it symbol 0, the label used
    # for the epsilon arcs below -- confirm against the SymbolTable binding.
    letters.AddSymbol("ε")
    vocab = lmfst.InputSymbols()

    dictfst = openfst.StdVectorFst()
    root = dictfst.AddState()
    dictfst.SetStart(root)
    sink = dictfst.AddState()
    dictfst.SetFinal(sink, 0)

    # First pass: register every input symbol that occurs in any word
    for word, word_id in vocab:
        if word_id != 0:
            for unit in spelling(word):
                letters.AddSymbol(unit)

    # Second pass: lay down one linear path per word
    for word, word_id in vocab:
        if word_id == 0:
            continue
        label = vocab.Find(word)
        # The word itself is emitted on the arc leaving the start state
        tail = dictfst.AddState()
        dictfst.AddArc(root, openfst.StdArc(0, label, 0, tail))
        for unit in spelling(word):
            head = dictfst.AddState()
            dictfst.AddArc(tail,
                           openfst.StdArc(letters.Find(unit), 0, 0, head))
            tail = head
        # Close the path into the shared final state
        dictfst.AddArc(tail, openfst.StdArc(0, 0, 0, sink))

    dictfst.SetInputSymbols(letters)
    dictfst.SetOutputSymbols(vocab)
    return dictfst
from optparse import OptionParser

# Command-line driver: convert every lattice named in the control file
# CTL, found under LATDIR with a ".slf" extension, using
# lat2fsg_posterior().
parser = OptionParser(usage="%prog CTL LATDIR")
parser.add_option("--prune", type="int", default=5)
parser.add_option("--errprune", type="float", default=1e-5)
parser.add_option("--errpen", type="float", default=1.0)
parser.add_option("--outext")
parser.add_option("--errfst")
opts, args = parser.parse_args(sys.argv[1:])
if len(args) != 2:
    # Report a usage error instead of failing on tuple unpacking
    parser.error("expected exactly two arguments: CTL LATDIR")
ctlfile, latdir = args

errfst = None
if opts.errfst:
    errfst = openfst.StdVectorFst.Read(opts.errfst)
    openfst.Prune(errfst, opts.errprune)
    sigma = errfst.InputSymbols().Find("σ")
    # Add the penalty opts.errpen to the weight of every arc except
    # those whose input label is the "σ" symbol.
    for node in errfst:
        for ai in errfst.mutable_iterarcs(node):
            arc = ai.Value()
            if arc.ilabel == sigma:
                continue
            ai.SetValue(openfst.StdArc(arc.ilabel, arc.olabel,
                                       arc.weight.Value() + opts.errpen,
                                       arc.nextstate))
    openfst.ArcSortInput(errfst)

# Default output extension encodes the pruning setting
if opts.outext is None:
    opts.outext = ".fsg%d" % opts.prune

# The control file is closed deterministically once all entries are done
with open(ctlfile) as ctl:
    for spam in ctl:
        uttid = spam.strip()
        latfile = os.path.join(latdir, uttid + ".slf")
        fsgfile = os.path.join(latdir, uttid + opts.outext)
        # Echo the control line as read (it carries its own newline)
        sys.stdout.write(spam)
        ofst = lat2fsg_posterior(latfile, fsgfile, opts.prune, errfst)
def add_mgram_states(fst, symtab, lm, m, sidtab, bo_label=0):
    """
    Create the states, word arcs and backoff arcs for every M-gram of
    the language model, for M<N.

    fst      -- FST being built (states and arcs are added to it).
    symtab   -- symbol table; Find(word) returns -1 for unknown words.
    lm       -- language model; lm.mgrams(m) yields entries exposing
                .words, .log_prob and .log_bowt.
    m        -- index of the last word of each entry (0 for 1-grams).
    sidtab   -- mapping from word tuples to state IDs, updated in place.
    bo_label -- label placed on both sides of backoff arcs.

    State 0 is used as the backoff state: 1-gram arcs leave from it and
    1-gram backoff arcs return to it.
    """
    eos_key = ('</s>', )
    for mg in lm.mgrams(m):
        words = mg.words
        last = words[m]
        wsym = symtab.Find(last)
        if wsym == -1:
            continue  # last word is OOV
        if m > 0 and words[0] == '</s>':
            continue  # nothing may follow </s>

        # Source state: the backoff state for 1-grams, otherwise the
        # state of the history (absent when the history has an OOV)
        if m == 0:
            src = 0
        else:
            history = tuple(words[0:m])
            if history not in sidtab:
                continue
            src = sidtab[history]

        # Destination state: all </s> entries share one final state
        is_final = (last == '</s>')
        if not is_final:
            dest = fst.AddState()
        elif eos_key in sidtab:
            dest = sidtab[eos_key]
        else:
            dest = fst.AddState()
            fst.SetFinal(dest, 0)
            sidtab[eos_key] = dest

        if last != '<s>':
            fst.AddArc(src, openfst.StdArc(wsym, wsym, -mg.log_prob, dest))
        elif m == 0:
            # <s> is a non-event: it gets no arc, and the 1-gram's state
            # becomes the initial state
            fst.SetStart(dest)

        if is_final:
            continue  # the final state is not remapped and never backs off

        # Register the freshly created state under the full M-gram
        sidtab[tuple(words)] = dest

        # Backoff arc to the suffix (M-1)-gram.  This matters in
        # particular when mg.log_bowt == 0.
        if m == 0:
            bo_state = 0
        else:
            suffix = tuple(words[1:])
            if suffix not in sidtab:
                continue  # not a 1-gram and no suffix M-gram state
            bo_state = sidtab[suffix]
        fst.AddArc(dest,
                   openfst.StdArc(bo_label, bo_label, -mg.log_bowt, bo_state))