Esempio n. 1
0
def lat2flat(latfile, fsgfile, lmfst):
    """
    Subset a language model using the vocabulary of a lattice.

    Loads the lattice from `latfile`, builds a one-state FST with a
    self-loop for every lattice word that is present in the input symbol
    table of `lmfst`, composes that FST with `lmfst`, determinizes the
    composition, writes it to `fsgfile` through build_fsg_fst() and
    returns the determinized FST.

    If the symbol table of `lmfst` contains "&phi;", the composition is
    done with phi matchers using that label on the `lmfst` side;
    otherwise a plain composition is used.
    """
    dag = lattice.Dag(latfile)
    syms = lmfst.InputSymbols()
    # A single state (assumed to get id 0) serves as both start and final
    fst = openfst.StdVectorFst()
    fst.SetStart(fst.AddState())
    fst.SetFinal(0, 0)
    seen_words = set()
    seen_labels = set()
    for node in dag.nodes:
        word = node.sym
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if word.startswith("++") or word == "<sil>":
            continue
        # Each distinct lattice symbol only needs to be looked up once
        if word in seen_words:
            continue
        seen_words.add(word)
        label = syms.Find(baseword(word))
        # Words unknown to the language model are left out
        if label == -1:
            continue
        # Different lattice symbols can resolve to the same label; adding
        # the identical self-loop a second time is redundant.
        if label in seen_labels:
            continue
        seen_labels.add(label)
        fst.AddArc(0, label, label, 0, 0)
    fst.SetOutputSymbols(syms)
    phi = syms.Find("&phi;")
    if phi != -1:
        opts = openfst.StdPhiComposeOptions()
        opts.matcher1 = openfst.StdPhiMatcher(fst, openfst.MATCH_NONE)
        opts.matcher2 = openfst.StdPhiMatcher(lmfst, openfst.MATCH_INPUT, phi)
        cfst = openfst.StdComposeFst(fst, lmfst, opts)
    else:
        cfst = openfst.StdComposeFst(fst, lmfst)
    outfst = openfst.StdVectorFst()
    openfst.Determinize(cfst, outfst)
    # Write it back out as an FSG for PocketSphinx.
    build_fsg_fst(outfst, fsgfile)
    return outfst
Esempio n. 2
0
def strset2fst(strs, fstclass=openfst.StdVectorFst):
    """
    Build a dictionary lookup FST for a set of strings.

    For every string in `strs` a chain of arcs is added from the start
    state: one arc per character (character on the input side, label 0
    on the output side), followed by one arc with input label 0 that
    emits the whole string, ending in a final state.  The chains are
    then determinized into a new `fstclass` instance and
    epsilon-removed.

    Returns the resulting FST with a "chars" input symbol table and a
    "words" output symbol table attached.
    """
    char_syms = openfst.SymbolTable("chars")
    word_syms = openfst.SymbolTable("words")
    # "&epsilon;" is registered first in both tables; the arcs below use
    # label 0 for it (presumably the id the first symbol receives).
    for table in (char_syms, word_syms):
        table.AddSymbol("&epsilon;")

    chains = fstclass()
    root = chains.AddState()
    chains.SetStart(root)
    for word in strs:
        state = root
        for ch in word:
            target = chains.AddState()
            chains.AddArc(state, char_syms.AddSymbol(ch), 0, 0, target)
            state = target
        # The closing arc consumes nothing and outputs the word itself
        accept = chains.AddState()
        chains.AddArc(state, 0, word_syms.AddSymbol(word), 0, accept)
        chains.SetFinal(accept, 0)

    lookup = fstclass()
    openfst.Determinize(chains, lookup)
    openfst.RmEpsilon(lookup)
    lookup.SetInputSymbols(char_syms)
    lookup.SetOutputSymbols(word_syms)
    return lookup
def optimize_openfst(fst, optimize=1):
    """Return an optimized version of `fst`.

    `optimize` selects the amount of work:

    * 1 -- encode the labels, remove epsilons, determinize, minimize,
      then decode the labels again.
    * 2 -- remove epsilons, determinize and minimize, without the
      encode/decode step.
    * any other value (including 0) -- return `fst` untouched.

    For levels 1 and 2 the input `fst` is modified in place by the
    encoding and epsilon-removal steps, and the returned object is a
    newly created Fst.
    """
    if optimize not in (1, 2):
        return fst

    encode_labels = (optimize == 1)
    if encode_labels:
        mapper = openfst.StdEncodeMapper(openfst.kEncodeLabels,
                                         openfst.ENCODE)
        openfst.Encode(fst, mapper)

    openfst.RmEpsilon(fst)
    result = Fst()
    openfst.Determinize(fst, result)
    openfst.Minimize(result)

    if encode_labels:
        # Undo the label encoding applied before determinization
        openfst.Decode(result, mapper)
    return result
def make_line_fst(lines):
    """Build an OcroFST from a sequence of lines.

    Every element of `lines` is added to a fresh Fst with add_line().
    The result is determinized and minimized, written to a temporary
    file, and loaded back into an object created by
    ocropus.make_OcroFST(), which is returned.

    If add_line() raises, the 1-based index of the offending line is
    printed ("on line N") and the exception is re-raised.
    """
    import tempfile

    fst = Fst()
    for lineno, line in enumerate(lines, 1):
        try:
            add_line(fst, line)
        except Exception:
            # Single-argument form prints the same text under Python 2
            # and Python 3.
            print("on line %d" % lineno)
            raise
    det = Fst()
    openfst.Determinize(fst, det)
    openfst.Minimize(det)
    # mkstemp creates a uniquely named file instead of relying on a
    # predictable /tmp/<pid>.fst path.
    handle, temp = tempfile.mkstemp(suffix=".fst")
    os.close(handle)
    try:
        det.Write(temp)
        result = ocropus.make_OcroFST()
        result.load(temp)
    finally:
        # Remove the temporary file even when writing or loading fails
        os.unlink(temp)
    return result
Esempio n. 5
0
def log_minimize(fst):
    """Determinize `fst` into a new LFST, minimize that, and return it."""
    minimized = LFST()
    openfst.Determinize(fst, minimized)
    openfst.Minimize(minimized)
    return minimized
Esempio n. 6
0
def build_lattice_fsg(dag, syms=None, ascale=0, pscale=0,
                      addsyms=False, determinize=True,
                      baseword=baseword):
    """
    Build an FSM from a Sphinx word lattice.

    Parameters:
      dag         -- the word lattice; its `nodes`, `start` and `end`
                     attributes are used.
      syms        -- symbol table to use.  If None, a new "words" table
                     is created (pre-populated with "&epsilon;",
                     "&sigma;", "&rho;" and "&phi;") and `addsyms` is
                     forced on.
      ascale      -- if non-zero, arc weights are -x.ascr * ascale.
      pscale      -- if non-zero (and `ascale` is zero), arc weights are
                     -x.post * pscale.
      addsyms     -- add the base word of every kept node to the symbol
                     table.
      determinize -- determinize the result; ignored (treated as False)
                     when `ascale` or `pscale` is set.
      baseword    -- function mapping a node symbol to the word looked
                     up in the symbol table.

    Returns an openfst.StdVectorFst with the symbol table attached as
    both input and output symbols.  Note that "</s>" may be added to the
    symbol table even when `addsyms` is False.
    """
    fst = openfst.StdVectorFst()
    if syms is None:
        fsgsyms = openfst.SymbolTable("words")
        for special in ("&epsilon;", "&sigma;", "&rho;", "&phi;"):
            fsgsyms.AddSymbol(special)
        addsyms = True
    else:
        fsgsyms = syms
    statemap = {}
    for n in dag.nodes:
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if n.sym.startswith("++") or n.sym == "<sil>":
            continue
        # These should not exist, but they do (!!)
        if n.sym == "<s>" and n.entry != 0:
            continue
        if n not in statemap:
            statemap[n] = fst.AddState()
        if addsyms:
            fsgsyms.AddSymbol(baseword(n.sym))
    # The start node always receives a freshly allocated state here, even
    # if the loop above already mapped it; the arcs below use this one.
    statemap[dag.start] = fst.AddState()
    fst.SetStart(statemap[dag.start])
    for n in dag.nodes:
        if n not in statemap:
            continue
        sym = fsgsyms.Find(baseword(n.sym))
        # Turn OOVs and non-events into epsilons
        if sym == -1 or n.sym == "<s>":
            sym = 0
        for x in n.exits:
            # Drop links into nodes that were skipped above
            if x.dest not in statemap:
                continue
            weight = 0
            if ascale:
                weight = -x.ascr * ascale
            elif pscale:
                weight = -x.post * pscale
            fst.AddArc(statemap[x.src], sym, sym, weight, statemap[x.dest])
    # Add a </s> transition if none exists
    if not any(x.src.sym == '</s>' for x in dag.end.entries):
        end = fst.AddState()
        sym = fsgsyms.AddSymbol("</s>")
        fst.AddArc(statemap[dag.end], sym, sym, 0, end)
        fst.SetFinal(end, 0)
    else:
        fst.SetFinal(statemap[dag.end], 0)
    # Epsilon-remove it (like bypassing fillers...) (FIXME: umm...)
    openfst.RmEpsilon(fst)
    # Don't determinize if it's weighted
    if ascale or pscale:
        determinize = False
    if determinize:
        outfst = openfst.StdVectorFst()
        openfst.Determinize(fst, outfst)
        fst = outfst
    fst.SetInputSymbols(fsgsyms)
    fst.SetOutputSymbols(fsgsyms)
    return fst