Esempio n. 1
0
def strset2fst(strs, fstclass=openfst.StdVectorFst):
    """
    Compile a collection of strings into a character-to-word lookup FST.

    For every string a linear path is laid out from a shared start
    state: each character is consumed on the input side (output label
    0), and a closing arc with input label 0 emits the whole string on
    the output side and leads to a final state.  The machine is then
    run through Determinize and RmEpsilon, and the result is returned
    with the "chars" table as input symbols and the "words" table as
    output symbols.
    """
    char_syms = openfst.SymbolTable("chars")
    word_syms = openfst.SymbolTable("words")
    # Both tables get the epsilon entry before any real symbol.
    for table in (char_syms, word_syms):
        table.AddSymbol("ε")

    raw = fstclass()
    root = raw.AddState()
    raw.SetStart(root)
    for word in strs:
        tail = root
        # Spell the word out, one character per arc.
        for ch in word:
            head = raw.AddState()
            raw.AddArc(tail, char_syms.AddSymbol(ch), 0, 0, head)
            tail = head
        # Closing arc: nothing consumed, the word itself emitted.
        last = raw.AddState()
        raw.AddArc(tail, 0, word_syms.AddSymbol(word), 0, last)
        raw.SetFinal(last, 0)

    result = fstclass()
    openfst.Determinize(raw, result)
    openfst.RmEpsilon(result)
    result.SetInputSymbols(char_syms)
    result.SetOutputSymbols(word_syms)
    return result
Esempio n. 2
0
 def __init__(self, isyms=None, osyms=None, ssyms=None):
     """
     Initialise the FST with its symbol tables and a start state.

     Any table passed as None is replaced by a newly created one:
     "inputs" and "outputs" each receive an epsilon entry, "states"
     receives "__START__".  The input and output tables are attached
     to the FST, the state table is stored on ``self.ssyms``, and a
     newly added state is made the start state.
     """
     openfst.StdVectorFst.__init__(self)
     # Identity tests against None: a caller-supplied table is never
     # replaced, whatever its comparison or truth semantics are.
     if isyms is None:
         isyms = openfst.SymbolTable("inputs")
         isyms.AddSymbol("ε")
     if osyms is None:
         osyms = openfst.SymbolTable("outputs")
         osyms.AddSymbol("ε")
     if ssyms is None:
         ssyms = openfst.SymbolTable("states")
         ssyms.AddSymbol("__START__")
     self.ssyms = ssyms
     self.SetInputSymbols(isyms)
     self.SetOutputSymbols(osyms)
     self.SetStart(self.AddState())
Esempio n. 3
0
def fstcompile(infile):
    """
    Compile a textual FST description into an FST.

    Each line of ``infile`` is split on whitespace and interpreted by
    its number of fields:

    * one field, ``state``: mark the state final with weight 0;
    * two fields, ``state weight``: mark the state final with that
      weight;
    * three or more, ``src dest symbol [prob]``: add an arc carrying
      ``symbol`` as both input and output label, weighted
      ``-log(prob)`` (``prob`` defaults to 1.0).  The symbol ``eps``
      is stored as the epsilon entry of the symbol table.

    Blank lines are ignored.  State names are mapped to FST state ids
    in order of first appearance, on final-state lines as well as arc
    lines, so the state named first in the file becomes state 0, which
    is used as the start state.  For input without any state no start
    state is set.

    Returns the FST with one shared "symbols" table set as both input
    and output symbol table.
    """
    fst = openfst.StdVectorFst()
    symtab = openfst.SymbolTable("symbols")
    symtab.AddSymbol("ε")
    # Allocates a new FST state the first time a state name is seen.
    statemap = collections.defaultdict(fst.AddState)
    for line in infile:
        fields = line.split()
        if not fields:
            continue
        if len(fields) <= 2:
            # Final-state line.  Resolve the name through statemap like
            # arcs do, so the state marked final is the one the arcs
            # actually lead to (and it exists even if named here first).
            # NOTE(review): the final weight is used as given, whereas
            # arc lines carry a probability -- confirm this is intended.
            weight = float(fields[1]) if len(fields) == 2 else 0
            fst.SetFinal(statemap[fields[0]], weight)
            continue
        prob = float(fields[3]) if len(fields) > 3 else 1.0
        if fields[2] == 'eps':
            fields[2] = 'ε'
        sym = symtab.AddSymbol(fields[2])
        src = statemap[fields[0]]
        dest = statemap[fields[1]]
        fst.AddArc(src, sym, sym, -math.log(prob), dest)
    # Only name a start state when at least one state was created.
    if statemap:
        fst.SetStart(0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
Esempio n. 4
0
def sent2fst(txt, fstclass=openfst.StdVectorFst, isyms=None, omitstart=True):
    """
    Convert a list of words, or a string of whitespace-separated
    tokens, to a sentence FST.

    The result is a linear chain with one arc per kept token, using
    the same label on input and output and weight 0.  The state
    reached after the last kept token is final; for an empty sentence
    that is the start state itself.

    txt       -- sequence of tokens, or a string that is split on
                 whitespace.
    fstclass  -- class instantiated to create the FST.
    isyms     -- optional symbol table.  When given, tokens are looked
                 up in it and tokens it does not contain are skipped;
                 otherwise a new "words" table is built from the
                 tokens.
    omitstart -- when true, '<s>' tokens are left out.

    Returns the FST with the symbol table set for input and output.
    """
    fst = fstclass()
    start = fst.AddState()
    fst.SetStart(start)
    if isyms:
        symtab = isyms
    else:
        symtab = openfst.SymbolTable("words")
        symtab.AddSymbol("ε")
    if isinstance(txt, str):
        txt = txt.split()
    prev = start
    for c in txt:
        if omitstart and c == '<s>':
            continue
        if isyms:
            sym = isyms.Find(c)
            if sym == -1:
                # Unknown word: leave it out of the chain entirely.
                continue
        else:
            sym = symtab.AddSymbol(c)
        # Allocate the next state only once the token is known to be
        # kept, so skipped tokens leave no unreachable states behind.
        nxt = fst.AddState()
        fst.AddArc(prev, sym, sym, 0, nxt)
        prev = nxt
    # 'prev' is always the end of the chain actually built, including
    # when nothing was added or the last token was skipped.
    fst.SetFinal(prev, 0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
Esempio n. 5
0
def build_classfst(probdef, isyms=None):
    """
    Construct the word-to-class transducer described by a Sphinx
    probability definition.

    ``probdef`` may be a SphinxProbdef instance or anything its
    constructor accepts.  The resulting single-state machine loops
    every non-epsilon symbol already in the symbol table onto itself,
    and additionally maps each class member word to its class label
    with weight -log(probability).  It can be composed with the input,
    or pre-composed with the language model; in the latter case,
    projecting the result to its input yields an equivalent
    non-class-based model.

    When ``isyms`` is given it is used (and extended) as the symbol
    table; otherwise a new "words" table is created.
    """
    model = probdef if isinstance(probdef, SphinxProbdef) else SphinxProbdef(probdef)
    fst = openfst.StdVectorFst()
    if not isyms:
        symtab = openfst.SymbolTable("words")
        symtab.AddSymbol("&epsilon;")
    else:
        symtab = isyms

    # One state that is both start and final; every arc loops on it.
    hub = fst.AddState()
    fst.SetStart(hub)
    fst.SetFinal(hub, 0)

    # Identity loops for every symbol known so far, epsilon excepted.
    for _, label in symtab:
        if label != openfst.epsilon:
            fst.AddArc(hub, label, label, 0, hub)

    # Member-word -> class-label arcs, weighted by the member probability.
    for cname in model.classes:
        class_label = symtab.AddSymbol(cname)
        for member, p in model.classes[cname].iteritems():
            member_label = symtab.AddSymbol(member)
            fst.AddArc(hub, member_label, class_label, -math.log(p), hub)

    fst.SetOutputSymbols(symtab)
    fst.SetInputSymbols(symtab)
    return fst
 def testConvertSymbols(self):
     """
     ConvertSymbols relabels arcs according to a new symbol table.

     Two tables hold "foo" and "bar" under swapped ids.  A single arc
     labelled 1:1 is converted on both sides, then on the input side
     only, and the arc labels are checked after each step.  Finally an
     arc with labels missing from the tables is added and the
     conversion is expected to raise.
     """
     syms1 = openfst.SymbolTable("syms1")
     syms1.AddSymbol("&epsilon;")
     syms1.AddSymbol("foo", 1)
     syms1.AddSymbol("bar", 2)
     syms2 = openfst.SymbolTable("syms2")
     syms2.AddSymbol("&epsilon;")
     syms2.AddSymbol("bar", 1)
     syms2.AddSymbol("foo", 2)
     self.assertEqual(syms1.Find("foo"), 1)
     self.assertEqual(syms2.Find("foo"), 2)
     self.assertEqual(syms1.Find(1), "foo")
     self.assertEqual(syms2.Find(2), "foo")
     fst = openfst.StdVectorFst()
     st = fst.AddState()
     nst = fst.AddState()
     fst.AddArc(st, 1, 1, 0, nst)
     arc = fst.GetArc(st, 0)
     self.assertEqual(arc.ilabel, 1)
     self.assertEqual(arc.olabel, 1)
     fst.SetInputSymbols(syms1)
     fst.SetOutputSymbols(syms1)
     # Convert both sides: "foo" is 2 in syms2.
     openfst.ConvertSymbols(fst, syms2, True, True)
     arc = fst.GetArc(st, 0)
     self.assertEqual(arc.ilabel, 2)
     self.assertEqual(arc.olabel, 2)
     # Convert the input side only: the output label stays at 2.
     openfst.ConvertSymbols(fst, syms1, True, False)
     arc = fst.GetArc(st, 0)
     self.assertEqual(arc.ilabel, 1)
     self.assertEqual(arc.olabel, 2)
     # Labels 42 and 69 are in neither table.
     fst.AddArc(st, 42, 69, 0, nst)
     try:
         openfst.ConvertSymbols(fst, syms2, True, False)
     except Exception:
         # The exact exception type raised by the binding is not
         # pinned down here; any ordinary exception counts.
         pass
     else:
         self.fail("expected failure for unknown symbol")
Esempio n. 7
0
def build_dictfst(lmfst):
    """
    Build a spelling transducer (characters in, words out) from the
    input symbol table of ``lmfst``.

    For every non-epsilon word in that table a path is created from
    the start state: a first arc emits the word label without
    consuming input, then one arc per character consumes that
    character, and a last label-free arc leads to the shared final
    state.  The end-of-sentence marker '</s>' is treated as a single
    input symbol rather than being spelled out.

    Returns the FST with a new "letters" table as input symbols and
    the table taken from ``lmfst`` as output symbols.
    """
    letters = openfst.SymbolTable("letters")
    letters.AddSymbol("&epsilon;")
    words = lmfst.InputSymbols()

    fst = openfst.StdVectorFst()
    start = fst.AddState()
    fst.SetStart(start)
    final = fst.AddState()
    fst.SetFinal(final, 0)

    # Pass 1: register every input unit so that all letter ids are
    # assigned before any arc is created.
    for word, label in words:
        if label != 0:
            # '</s>' counts as one unit; other words are iterated
            # character by character.
            units = [word] if word == '</s>' else word
            for unit in units:
                letters.AddSymbol(unit)

    # Pass 2: lay out one path per word.
    for word, label in words:
        if label == 0:
            continue
        # The label is looked up again by name, as in the first design
        # of this routine, before the word is possibly wrapped below.
        label = words.Find(word)
        # Label-free input, word output, into the word's first state.
        tail = fst.AddState()
        fst.AddArc(start, openfst.StdArc(0, label, 0, tail))
        units = [word] if word == '</s>' else word
        for unit in units:
            unit_label = letters.Find(unit)
            head = fst.AddState()
            fst.AddArc(tail, openfst.StdArc(unit_label, 0, 0, head))
            tail = head
        # Join the shared final state without consuming or emitting.
        fst.AddArc(tail, openfst.StdArc(0, 0, 0, final))

    fst.SetInputSymbols(letters)
    fst.SetOutputSymbols(words)
    return fst
Esempio n. 8
0
def str2fst(txt, fstclass=openfst.StdVectorFst):
    """
    Convert a text string to an FST.

    The result is a linear chain with one arc per character of
    ``txt``, using the same label on input and output and weight 0.
    The state reached after the last character is final; for an empty
    string that is the start state itself.

    txt      -- the string (any iterable of symbols) to encode.
    fstclass -- class instantiated to create the FST.

    Returns the FST with a new "chars" symbol table set for both input
    and output.
    """
    fst = fstclass()
    start = fst.AddState()
    fst.SetStart(start)
    symtab = openfst.SymbolTable("chars")
    symtab.AddSymbol("&epsilon;")
    prev = start
    for c in txt:
        nxt = fst.AddState()
        sym = symtab.AddSymbol(c)
        fst.AddArc(prev, sym, sym, 0, nxt)
        prev = nxt
    # 'prev' is the end of the chain and is defined even when txt is
    # empty, in which case the start state becomes final.
    fst.SetFinal(prev, 0)
    fst.SetInputSymbols(symtab)
    fst.SetOutputSymbols(symtab)
    return fst
Esempio n. 9
0
def build_lmfst(lm, use_phi=False):
    """
    Turn an N-gram backoff language model into an FST recognizer.

    Backoff arcs are labelled with epsilon, or with a dedicated
    "&phi;" symbol when ``use_phi`` is true.  The returned FST has
    been passed through Connect and ArcSortInput and carries one
    shared "words" symbol table for input and output.
    """
    fst = openfst.StdVectorFst()
    words = openfst.SymbolTable("words")
    eps_label = words.AddSymbol("&epsilon;")
    backoff_label = words.AddSymbol("&phi;") if use_phi else eps_label
    # Register the word of every entry returned by lm.mgrams(0).
    for entry in lm.mgrams(0):
        words.AddSymbol(entry.words[0])
    fst.SetInputSymbols(words)
    fst.SetOutputSymbols(words)

    # Outline of the construction:
    #
    #   create a backoff state
    #   for M in 1 .. N-1:
    #     for each M-gram w(1,M):
    #       create state q(1,M)
    #       add arc q(1,M-1) -> q(1,M) weighted P(w(1,M))
    #       add arc q(1,M)   -> q(2,M) weighted bowt(w(1,M-1))
    #   for each N-gram w(1,N):
    #     add arc q(1,N-1) -> q(2,N) weighted P(w(1,N))

    # M-gram -> state id lookup shared by the helper calls below.
    state_ids = {}
    # The first state added is expected to receive id zero.
    fst.AddState()
    for order in range(lm.get_size() - 1):
        add_mgram_states(fst, words, lm, order, state_ids, backoff_label)
    add_ngram_arcs(fst, words, lm, lm.get_size(), state_ids)

    # Trim with Connect, then sort arcs by input label.
    openfst.Connect(fst)
    openfst.ArcSortInput(fst)
    return fst
Esempio n. 10
0
# -*- coding: utf-8 -*-

from pylab import *
import openfst
from openfst import StdVectorFst as FST
from openfst import LogVectorFst as LFST

# Symbol table covering codes 0..126, keyed by the code itself.
ASCII = openfst.SymbolTable("ASCII")

# Code 0 carries the epsilon glyph.
ASCII.AddSymbol("ϵ", 0)
# Codes 1..32 are named by a "$xx" hexadecimal placeholder.
for i in range(1, 33):
    ASCII.AddSymbol("$%02x" % i, i)
# Codes 33..126 are named by the character itself.
for i in range(33, 127):
    ASCII.AddSymbol(chr(i), i)


def minimize(fst):
    """
    Return a determinized and then minimized version of ``fst``.

    The result is written into a newly created FST instance; the
    argument is only handed to Determinize as its source.
    """
    result = FST()
    openfst.Determinize(fst, result)
    openfst.Minimize(result)
    return result


def log_minimize(fst):
    """
    Return a determinized and then minimized version of ``fst``,
    built in a newly created LFST instance.

    The argument is only handed to Determinize as its source.
    """
    result = LFST()
    openfst.Determinize(fst, result)
    openfst.Minimize(result)
    return result
Esempio n. 11
0
def build_lattice_fsg(dag, syms=None, ascale=0, pscale=0,
                      addsyms=False, determinize=True,
                      baseword=baseword):
    """
    Build an FSM from a Sphinx word lattice.

    Every kept lattice node becomes a state, and every exit link of a
    kept node becomes an arc labelled (on both sides) with the symbol
    of the node it leaves.  Filler nodes (symbol starting with "++",
    or "<sil>") and "<s>" nodes whose entry is not 0 are dropped.
    Symbols missing from the table, and "<s>" itself, are written as
    label 0 and afterwards removed by RmEpsilon.

    dag         -- the word lattice; its ``nodes``, ``start`` and
                   ``end`` attributes are read.
    syms        -- symbol table to use.  When None, a new "words"
                   table is created (with four reserved entries) and
                   ``addsyms`` is forced on.
    ascale      -- if non-zero, arcs are weighted ``-ascr * ascale``.
    pscale      -- if non-zero and ``ascale`` is zero, arcs are
                   weighted ``-post * pscale``.
    addsyms     -- add node symbols to the symbol table.
    determinize -- determinize the result; ignored (treated as False)
                   when ``ascale`` or ``pscale`` is set.
    baseword    -- callable applied to a node symbol before it is
                   added to or looked up in the symbol table.

    Returns the FST with the symbol table set for input and output.
    """
    fst = openfst.StdVectorFst()
    if syms is None:
        fsgsyms = openfst.SymbolTable("words")
        fsgsyms.AddSymbol("&epsilon;")
        fsgsyms.AddSymbol("&sigma;")
        fsgsyms.AddSymbol("&rho;")
        fsgsyms.AddSymbol("&phi;")
        addsyms = True
    else:
        fsgsyms = syms
    statemap = {}
    for n in dag.nodes:
        # Skip fillers as they have been "bypassed" by PocketSphinx
        if n.sym.startswith("++") or n.sym == "<sil>":
            continue
        # These should not exist, but they do (!!)
        if n.sym == "<s>" and n.entry != 0:
            continue
        if n not in statemap:
            statemap[n] = fst.AddState()
        if addsyms:
            fsgsyms.AddSymbol(baseword(n.sym))
    # The start node always gets a state of its own, assigned here.
    statemap[dag.start] = fst.AddState()
    fst.SetStart(statemap[dag.start])
    for n in dag.nodes:
        if n not in statemap:
            continue
        sym = fsgsyms.Find(baseword(n.sym))
        # Turn OOVs and non-events into epsilons (same for every exit
        # of this node, so decided once per node)
        if sym == -1 or n.sym == "<s>":
            sym = 0
        for x in n.exits:
            if x.dest not in statemap:
                continue
            weight = 0
            if ascale:
                weight = -x.ascr * ascale
            elif pscale:
                weight = -x.post * pscale
            fst.AddArc(statemap[x.src], sym, sym, weight, statemap[x.dest])
    # Add a </s> transition if none exists
    if '</s>' not in [x.src.sym for x in dag.end.entries]:
        end = fst.AddState()
        sym = fsgsyms.AddSymbol("</s>")
        fst.AddArc(statemap[dag.end], sym, sym, 0, end)
        fst.SetFinal(end, 0)
    else:
        fst.SetFinal(statemap[dag.end], 0)
    # Epsilon-remove it (like bypassing fillers...) (FIXME: umm...)
    openfst.RmEpsilon(fst)
    # Don't determinize if it's weighted
    if ascale or pscale:
        determinize = False
    if determinize:
        outfst = openfst.StdVectorFst()
        openfst.Determinize(fst, outfst)
        fst = outfst
    fst.SetInputSymbols(fsgsyms)
    fst.SetOutputSymbols(fsgsyms)
    return fst