def lat2flat(latfile, fsgfile, lmfst):
    """
    Subset a language model using the vocabulary of a lattice.

    A one-state acceptor is built with one self-loop per lattice word
    that is present in the input symbol table of ``lmfst``.  It is
    composed with ``lmfst`` (through a phi matcher when the symbol
    table contains a phi symbol), the composition is determinized, and
    the result is written out via ``build_fsg_fst``.

    :param latfile: lattice to read; handed to ``lattice.Dag``.
    :param fsgfile: output destination; handed to ``build_fsg_fst``.
    :param lmfst: language model FST to be subset.
    :return: the determinized FST.
    """
    dag = lattice.Dag(latfile)
    fst = openfst.StdVectorFst()
    fst.SetStart(fst.AddState())
    fst.SetFinal(0, 0)
    syms = lmfst.InputSymbols()
    seen_words = set()
    # Several distinct node symbols may resolve (through baseword()) to
    # the same vocabulary entry, so also remember which symbol ids
    # already have an arc and never add the same self-loop twice.
    seen_ids = set()
    for n in dag.nodes:
        word = n.sym
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if word.startswith("++") or word == "<sil>":
            continue
        if word in seen_words:
            continue
        seen_words.add(word)
        sym = syms.Find(baseword(word))
        # -1 means the word is not in the language model vocabulary.
        if sym == -1 or sym in seen_ids:
            continue
        seen_ids.add(sym)
        fst.AddArc(0, sym, sym, 0, 0)
    fst.SetOutputSymbols(syms)
    phi = syms.Find("φ")
    if phi != -1:
        # The language model has a phi symbol: match it on the LM side
        # with a phi matcher instead of as an ordinary label.
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(fst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        cfst = openfst.StdComposeFst(fst, lmfst, opts)
    else:
        cfst = openfst.StdComposeFst(fst, lmfst)
    outfst = openfst.StdVectorFst()
    openfst.Determinize(cfst, outfst)
    # Write it back out as an FSG for PocketSphinx.
    build_fsg_fst(outfst, fsgfile)
    return outfst
def lat2fsg(lat, fsgfile, lmfst, prune=15):
    """
    Expand a lattice against a language model and write it as an FSG.

    :param lat: either an already-loaded lattice object, or a string
        path.  Paths ending in ".slf" are loaded with
        ``lattice.Dag(htkfile=...)``, any other path with
        ``lattice.Dag(...)``.
    :param fsgfile: output destination; handed to ``build_fsg_fst``.
    :param lmfst: language model FST to compose the lattice with.
    :param prune: threshold handed to ``openfst.Prune``.
    :return: the pruned, composed FST.
    """
    # Resolve the lattice argument into a DAG object.
    if not isinstance(lat, str):
        dag = lat
    elif lat.endswith(".slf"):
        dag = lattice.Dag(htkfile=lat)
    else:
        dag = lattice.Dag(lat)

    lattice_fst = build_lattice_fsg(dag, lmfst.InputSymbols())

    # Intersecting the lattice with the language model is what gives
    # the arcs proper N-gram scores (on its own the lattice FST is only
    # a unigram model); this is also known as "lattice expansion".
    phi_label = lmfst.InputSymbols().Find("φ")
    if phi_label == -1:
        composed = openfst.StdComposeFst(lattice_fst, lmfst)
    else:
        compose_opts = openfst.StdPhiComposeOptions()
        compose_opts.matcher1 = openfst.StdPhiMatcher(
            lattice_fst, openfst.MATCH_NONE)
        compose_opts.matcher2 = openfst.StdPhiMatcher(
            lmfst, openfst.MATCH_INPUT, phi_label)
        composed = openfst.StdComposeFst(lattice_fst, lmfst, compose_opts)

    # Materialize the lazy composition so it can be pruned in place.
    result = openfst.StdVectorFst(composed)
    openfst.Prune(result, prune)

    # Hand the result to PocketSphinx in FSG form.
    build_fsg_fst(result, fsgfile)
    return result
def lat_rescore(dag, lmfst, lw=9.5):
    """
    Rescore a lattice using a language model FST.

    :param dag: lattice to rescore.
    :param lmfst: language model FST.
    :param lw: ``1. / lw`` is passed as the third argument of
        ``lat2fsg.build_lattice_fsg`` (presumably a language weight
        used to scale the lattice scores -- confirm against that
        function).
    :return: ``(words, score)`` where ``words`` starts with ``'<s>'``
        followed by one symbol per best-path arc whose output label is
        non-zero, and ``score`` is the negated sum of the best-path arc
        weights.
    """
    lm_syms = lmfst.InputSymbols()
    lattice_fst = lat2fsg.build_lattice_fsg(dag, lmfst.InputSymbols(), 1. / lw)

    phi_label = lmfst.InputSymbols().Find("φ")
    if phi_label == -1:
        composed = openfst.StdComposeFst(lattice_fst, lmfst)
    else:
        compose_opts = openfst.StdPhiComposeOptions()
        compose_opts.matcher1 = openfst.StdPhiMatcher(
            lattice_fst, openfst.MATCH_NONE)
        compose_opts.matcher2 = openfst.StdPhiMatcher(
            lmfst, openfst.MATCH_INPUT, phi_label)
        composed = openfst.StdComposeFst(lattice_fst, lmfst, compose_opts)

    # Extract the single best path through the composition.
    best = openfst.StdVectorFst()
    openfst.ShortestPath(composed, best, 1)

    # Walk the best path from its start state, always following the
    # first arc, until a state with no arcs (or no state) is reached.
    words = ['<s>']
    score = 0
    state = best.Start()
    while True:
        if state == -1 or not best.NumArcs(state):
            break
        arc = best.GetArc(state, 0)
        if arc.olabel != 0:
            words.append(lm_syms.Find(arc.ilabel))
        score -= arc.weight.Value()
        state = arc.nextstate
    return words, score
def testCompose(self):
    """
    Check plain composition of two small transducers.

    Both operands have the arcs 1:2/0, 2:3/0 and 3:3/1 from state 0 to
    state 1.  The test checks the output labels and summed weights of
    the arcs of their composition.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 1, 2, 0, 1)
    a.AddArc(0, 2, 3, 0, 1)
    a.AddArc(0, 3, 3, 1, 1)
    a.SetStart(0)
    a.SetFinal(1, 0)
    b = openfst.StdVectorFst()
    b.AddState()
    b.AddState()
    b.AddArc(0, 1, 2, 0, 1)
    b.AddArc(0, 2, 3, 0, 1)
    b.AddArc(0, 3, 3, 1, 1)
    b.SetStart(0)
    b.SetFinal(1, 0)
    c = openfst.StdComposeFst(a, b)
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists in Python 3.12+.
    for s in c:
        for arc in c.iterarcs(s):
            self.assertEqual(arc.nextstate, 1)
            if arc.ilabel == 1:
                self.assertEqual(arc.olabel, 3)
            if arc.ilabel == 2:
                self.assertEqual(arc.olabel, 3)
                self.assertEqual(arc.weight.Value(), 1)
            if arc.ilabel == 3:
                self.assertEqual(arc.olabel, 3)
                self.assertEqual(arc.weight.Value(), 2)
def testComposePhi(self):
    """
    Check composition using a phi matcher on the second operand.

    ``b`` accepts label 2 directly in state 0 at weight 0, and reaches
    labels 3 and 4 only from state 1, which is entered through the arc
    labelled 1 (the phi label given to the matcher) at weight 1.  The
    test checks the weights and destinations of the composed arcs.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 2, 2, 0, 1)
    a.AddArc(0, 3, 3, 0, 1)
    a.AddArc(0, 4, 4, 0, 1)
    a.SetStart(0)
    a.SetFinal(0, 0)
    b = openfst.StdVectorFst()
    b.AddState()
    b.AddState()
    b.AddArc(0, 1, 1, 1, 1)
    b.AddArc(0, 2, 2, 0, 0)
    b.AddArc(1, 3, 3, 0, 1)
    b.AddArc(1, 4, 4, 0, 1)
    b.SetStart(0)
    b.SetFinal(0, 0)
    b.SetFinal(1, 0)
    opts = openfst.StdPhiComposeOptions()
    # Label 1 on the input side of b is treated as phi.
    opts.matcher2 = openfst.StdPhiMatcher(b, openfst.MATCH_INPUT, 1)
    # This is necessary for reasons I do not understand
    opts.matcher1 = openfst.StdPhiMatcher(a, openfst.MATCH_NONE)
    c = openfst.StdComposeFst(a, b, opts)
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists in Python 3.12+.
    for s in c:
        for arc in c.iterarcs(s):
            if arc.ilabel == 2:
                self.assertEqual(arc.weight.Value(), 0)
                self.assertEqual(arc.nextstate, 1)
            elif arc.ilabel == 3 or arc.ilabel == 4:
                self.assertEqual(arc.weight.Value(), 1)
                self.assertEqual(arc.nextstate, 2)
def testComposeRho(self):
    """
    Check composition using a rho matcher on the second operand.

    ``b`` matches label 2 at weight 0 and carries one arc labelled 1
    (the rho label given to the matcher) at weight 1.  The test checks
    that the composed arc for label 2 has weight 0 and every other
    composed arc has weight 1.
    """
    a = openfst.StdVectorFst()
    a.AddState()
    a.AddState()
    a.AddArc(0, 2, 2, 0, 1)
    a.AddArc(0, 3, 3, 0, 1)
    a.AddArc(0, 4, 4, 0, 1)
    a.SetStart(0)
    a.SetFinal(1, 0)
    # Build an FST that matches 2 with no weight and everything
    # else adding weight 1
    b = openfst.StdVectorFst()
    b.AddState()
    b.AddArc(0, 1, 1, 1, 0)
    b.AddArc(0, 2, 2, 0, 0)
    b.SetStart(0)
    b.SetFinal(0, 0)
    opts = openfst.StdRhoComposeOptions()
    # Label 1 on the input side of b is treated as rho.
    opts.matcher2 = openfst.StdRhoMatcher(b, openfst.MATCH_INPUT, 1)
    # This is necessary for reasons I do not understand
    opts.matcher1 = openfst.StdRhoMatcher(a, openfst.MATCH_NONE)
    c = openfst.StdComposeFst(a, b, opts)
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists in Python 3.12+.
    for s in c:
        for arc in c.iterarcs(s):
            self.assertEqual(arc.nextstate, 1)
            if arc.ilabel == 2:
                self.assertEqual(arc.weight.Value(), 0)
            else:
                self.assertEqual(arc.weight.Value(), 1)
def apply_errfst(fst, errfst):
    """
    Compose ``fst`` with an error model and project onto the output.

    The composition uses sigma matchers; the sigma label is looked up
    in the input symbol table of ``errfst``.

    :param fst: left-hand FST of the composition.
    :param errfst: error-model FST (right-hand side).
    :return: a ``StdVectorFst`` holding the output projection of the
        composition.
    """
    sigma_label = errfst.InputSymbols().Find("σ")

    compose_opts = openfst.StdSigmaComposeOptions()
    compose_opts.matcher1 = openfst.StdSigmaMatcher(fst, openfst.MATCH_NONE)
    compose_opts.matcher2 = openfst.StdSigmaMatcher(
        errfst, openfst.MATCH_INPUT, sigma_label, True)

    # Materialize the lazy composition so it can be projected in place.
    projected = openfst.StdVectorFst(
        openfst.StdComposeFst(fst, errfst, compose_opts))
    openfst.ProjectOutput(projected)
    return projected
def build_class_lmfst(lm, probdef, use_phi=False):
    """
    Build an FST from a class-based language model.

    The return value is the lazy composition of the class definition
    transducer with the language model FST.  To get the full language
    model, copy it into a VectorFst and project that onto its input.

    :param lm: language model, handed to ``build_lmfst``.
    :param probdef: class definitions, handed to ``build_classfst``.
    :param use_phi: forwarded unchanged to ``build_lmfst``.
    :return: ``openfst.StdComposeFst`` of the class FST and the LM FST.
    """
    word_lm = build_lmfst(lm, use_phi)
    class_fst = build_classfst(probdef, word_lm.InputSymbols())
    # Both operands are arc-sorted on their input labels before the
    # lazy composition is created.
    for machine in (word_lm, class_fst):
        openfst.ArcSortInput(machine)
    return openfst.StdComposeFst(class_fst, word_lm)
def lmfst_eval(lmfst, sent):
    """
    Score a sentence against a language model FST.

    The sentence is converted to an FST with ``sent2fst``, composed
    with ``lmfst`` (through a phi matcher when the symbol table has a
    phi symbol), and the single best path is extracted.

    :param lmfst: language model FST.
    :param sent: sentence, handed to ``sent2fst``.
    :return: the negated sum of the arc weights along the best path.
    """
    sentence_fst = sent2fst(sent, openfst.StdVectorFst, lmfst.InputSymbols())

    phi_label = lmfst.InputSymbols().Find("φ")
    if phi_label == -1:
        composed = openfst.StdComposeFst(sentence_fst, lmfst)
    else:
        compose_opts = openfst.StdPhiComposeOptions()
        compose_opts.matcher1 = openfst.StdPhiMatcher(
            sentence_fst, openfst.MATCH_NONE)
        compose_opts.matcher2 = openfst.StdPhiMatcher(
            lmfst, openfst.MATCH_INPUT, phi_label)
        composed = openfst.StdComposeFst(sentence_fst, lmfst, compose_opts)

    best = openfst.StdVectorFst()
    openfst.ShortestPath(composed, best, 1)

    # Follow the first arc out of each state of the best path and
    # accumulate the negated weights until the path runs out.
    total = 0
    state = best.Start()
    while True:
        if state == -1 or not best.NumArcs(state):
            break
        arc = best.GetArc(state, 0)
        total -= arc.weight.Value()
        state = arc.nextstate
    return total
baseword=lattice.baseword_noclass) openfst.ArcSortInput(lfst) # Apply Levenshtein model to the input errfst = LevenshteinModel(rfst.OutputSymbols()) openfst.ArcSortInput(errfst) # Apply compound word model based on the lattice compfst = CompoundWordModel(errfst.OutputSymbols(), lfst.InputSymbols()) # Precompose and project it to the lattice so compound words # are split in the alignment xlat = openfst.StdVectorFst() openfst.Compose(compfst, lfst, xlat) openfst.ProjectInput(xlat) openfst.ArcSortInput(xlat) # Compose everything together cfst = openfst.StdComposeFst(rfst, errfst) cfst = openfst.StdComposeFst(cfst, xlat) # Do bestpath search ofst = openfst.StdVectorFst() openfst.ShortestPath(cfst, ofst, 1) st = ofst.Start() err = 0 bt = [] while st != -1 and ofst.NumArcs(st): a = ofst.GetArc(st, 0) isym = ofst.InputSymbols().Find(a.ilabel) osym = ofst.OutputSymbols().Find(a.olabel) if isym == '</s>': break if a.ilabel == openfst.epsilon: isym = '*INS*'