def lat2flat(latfile, fsgfile, lmfst):
    """
    Subset a language model using the vocabulary of a lattice.

    A one-state acceptor is built with a self-loop for every non-filler
    lattice word that the language model's input symbol table knows
    about.  That acceptor is composed with `lmfst`, the composition is
    determinized, written out through build_fsg_fst() and returned.

    :param latfile: lattice file, passed to lattice.Dag().
    :param fsgfile: destination handed to build_fsg_fst().
    :param lmfst: language model FST; its input symbol table supplies
                  the arc labels.
    :return: the determinized composition as a StdVectorFst.
    """
    dag = lattice.Dag(latfile)
    fst = openfst.StdVectorFst()
    fst.SetStart(fst.AddState())
    fst.SetFinal(0, 0)
    syms = lmfst.InputSymbols()
    seen_words = set()
    seen_labels = set()
    for n in dag.nodes:
        word = n.sym
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if word.startswith("++") or word == "<sil>":
            continue
        if word in seen_words:
            continue
        seen_words.add(word)
        label = syms.Find(baseword(word))
        # Words unknown to the language model contribute nothing.
        if label == -1:
            continue
        # Different lattice symbols may resolve to the same LM label once
        # passed through baseword() (presumably pronunciation variants);
        # one self-loop per label is enough.
        if label in seen_labels:
            continue
        seen_labels.add(label)
        fst.AddArc(0, label, label, 0, 0)
    fst.SetOutputSymbols(syms)
    phi = syms.Find("φ")
    if phi != -1:
        # The model has a "φ" symbol: match it as a failure transition
        # on the language model's input side.
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(fst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        cfst = openfst.StdComposeFst(fst, lmfst, opts)
    else:
        cfst = openfst.StdComposeFst(fst, lmfst)
    outfst = openfst.StdVectorFst()
    openfst.Determinize(cfst, outfst)
    # Write it back out as an FSG for PocketSphinx.
    build_fsg_fst(outfst, fsgfile)
    return outfst
def strset2fst(strs, fstclass=openfst.StdVectorFst):
    """
    Compile a collection of strings into a dictionary lookup FST.

    Every string is spelled out one character per arc on the input side
    (with an empty output label), followed by a single arc with an empty
    input label whose output label is the whole string.  The machine is
    then determinized and epsilon-removed.

    :param strs: iterable of strings to include.
    :param fstclass: FST class to instantiate for both the working and
                     the returned machine.
    :return: an `fstclass` instance with "chars" input symbols and
             "words" output symbols attached.
    """
    char_table = openfst.SymbolTable("chars")
    word_table = openfst.SymbolTable("words")
    # "ε" is registered first in both tables; the arcs below use label 0
    # for the empty side.
    for table in (char_table, word_table):
        table.AddSymbol("ε")

    trie = fstclass()
    root = trie.AddState()
    trie.SetStart(root)
    for word in strs:
        state = root
        # One fresh state and one character-consuming arc per character.
        for ch in word:
            target = trie.AddState()
            trie.AddArc(state, char_table.AddSymbol(ch), 0, 0, target)
            state = target
        # Closing arc: consumes nothing, emits the word, ends in a final state.
        accept = trie.AddState()
        trie.AddArc(state, 0, word_table.AddSymbol(word), 0, accept)
        trie.SetFinal(accept, 0)

    lookup = fstclass()
    openfst.Determinize(trie, lookup)
    openfst.RmEpsilon(lookup)
    lookup.SetInputSymbols(char_table)
    lookup.SetOutputSymbols(word_table)
    return lookup
def optimize_openfst(fst, optimize=1):
    """Returns a minimized version of the input fst.

    The result may or may not be identical to the input.

    Levels:
      * 1 -- encode labels, remove epsilons, determinize, minimize, then
        decode the result.
      * 2 -- remove epsilons, determinize, minimize.
      * anything else (including 0) -- return `fst` untouched.

    For levels 1 and 2 the input is handed directly to Encode/RmEpsilon
    and the result is a new Fst; at level 1 only that new Fst is decoded
    afterwards, not the input.
    """
    with_encoding = (optimize == 1)
    if not (with_encoding or optimize == 2):
        return fst

    mapper = None
    if with_encoding:
        # Encode labels before epsilon removal / determinization.
        mapper = openfst.StdEncodeMapper(openfst.kEncodeLabels, openfst.ENCODE)
        openfst.Encode(fst, mapper)

    openfst.RmEpsilon(fst)
    minimized = Fst()
    openfst.Determinize(fst, minimized)
    openfst.Minimize(minimized)

    if with_encoding:
        openfst.Decode(minimized, mapper)
    return minimized
def make_line_fst(lines):
    """
    Build an OcroFST from an iterable of lines.

    Each line is added to a fresh Fst with add_line(); the result is
    determinized and minimized, written to a temporary file and loaded
    back into an object created by ocropus.make_OcroFST().

    :param lines: iterable of line descriptions accepted by add_line().
    :return: the loaded OcroFST object.
    :raises: whatever add_line() raises; the 1-based index of the
             offending line is printed first.
    """
    import tempfile

    fst = Fst()
    for count, line in enumerate(lines, 1):
        try:
            add_line(fst, line)
        except:
            # Deliberately catch-all: report which line failed, then
            # re-raise the original exception unchanged.
            print("on line %d" % count)
            raise
    det = Fst()
    openfst.Determinize(fst, det)
    openfst.Minimize(det)
    # mkstemp gives an unpredictable, exclusively created path; the
    # previous "/tmp/<pid>.fst" name could be guessed or collide.
    handle, temp = tempfile.mkstemp(suffix=".fst")
    os.close(handle)
    try:
        det.Write(temp)
        result = ocropus.make_OcroFST()
        result.load(temp)
    finally:
        # Remove the file even when writing or loading fails.
        os.unlink(temp)
    return result
def log_minimize(fst):
    """
    Determinize `fst` into a new LFST, minimize that LFST and return it.

    :param fst: source FST handed to openfst.Determinize().
    :return: the freshly created, determinized and minimized LFST.
    """
    minimized = LFST()
    openfst.Determinize(fst, minimized)
    openfst.Minimize(minimized)
    return minimized
def build_lattice_fsg(dag, syms=None, ascale=0, pscale=0,
                      addsyms=False, determinize=True,
                      baseword=baseword):
    """
    Build an FSM from a Sphinx word lattice.

    Lattice nodes become states; each exit of a node becomes an arc
    labelled with that node's word.

    :param dag: word lattice providing `nodes`, `start` and `end`.
    :param syms: symbol table to use.  When None, a new "words" table
                 seeded with "ε", "σ", "ρ" and "φ" is created and
                 `addsyms` is forced on.
    :param ascale: if non-zero, arc weights are `-x.ascr * ascale`.
    :param pscale: if non-zero (and `ascale` is zero), arc weights are
                   `-x.post * pscale`.
    :param addsyms: add the base word of every kept node to the table.
    :param determinize: determinize the result; ignored when either
                        `ascale` or `pscale` is set.
    :param baseword: callable mapping a node symbol to the word looked
                     up in the symbol table.
    :return: a StdVectorFst whose input and output symbols are both the
             symbol table used.  Note that "</s>" may be added to that
             table even when `addsyms` is false.
    """
    fst = openfst.StdVectorFst()
    if syms is None:
        fsgsyms = openfst.SymbolTable("words")
        fsgsyms.AddSymbol("ε")
        fsgsyms.AddSymbol("σ")
        fsgsyms.AddSymbol("ρ")
        fsgsyms.AddSymbol("φ")
        addsyms = True
    else:
        fsgsyms = syms
    statemap = {}
    for n in dag.nodes:
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if n.sym.startswith("++") or n.sym == "<sil>":
            continue
        # These should not exist, but they do (!!)
        if n.sym == "<s>" and n.entry != 0:
            continue
        if n not in statemap:
            statemap[n] = fst.AddState()
        if addsyms:
            fsgsyms.AddSymbol(baseword(n.sym))
    # Make sure the start node has a state, but reuse the one allocated
    # above when there is one instead of leaving it behind as an
    # unreachable orphan.
    if dag.start not in statemap:
        statemap[dag.start] = fst.AddState()
    fst.SetStart(statemap[dag.start])
    for n in dag.nodes:
        if n not in statemap:
            continue
        sym = fsgsyms.Find(baseword(n.sym))
        # Turn OOVs and non-events into epsilons
        if sym == -1 or n.sym == "<s>":
            sym = 0
        for x in n.exits:
            if x.dest not in statemap:
                continue
            # Acoustic score takes precedence over posterior when both
            # scales are given; otherwise the arc is unweighted.
            if ascale:
                weight = -x.ascr * ascale
            elif pscale:
                weight = -x.post * pscale
            else:
                weight = 0
            fst.AddArc(statemap[x.src], sym, sym, weight, statemap[x.dest])
    # Add a </s> transition if none exists
    if not any(x.src.sym == '</s>' for x in dag.end.entries):
        end = fst.AddState()
        sym = fsgsyms.AddSymbol("</s>")
        fst.AddArc(statemap[dag.end], sym, sym, 0, end)
        fst.SetFinal(end, 0)
    else:
        fst.SetFinal(statemap[dag.end], 0)
    # Epsilon-remove it (like bypassing fillers...) (FIXME: umm...)
    openfst.RmEpsilon(fst)
    # Don't determinize if it's weighted
    if ascale or pscale:
        determinize = False
    if determinize:
        outfst = openfst.StdVectorFst()
        openfst.Determinize(fst, outfst)
        fst = outfst
    fst.SetInputSymbols(fsgsyms)
    fst.SetOutputSymbols(fsgsyms)
    return fst