Ejemplo n.º 1
0
def add_ngram_arcs(fst, symtab, lm, n, sidtab):
    """
    Add arcs for all N-grams in the language model, where N=n (the
    order of the model, that is).

    For every N-gram returned by lm.mgrams(n - 1), an arc labelled with
    the final word is added from the state of its (n-1)-word history to
    the state of the longest proper suffix found in sidtab, weighted by
    the negated log probability.

    An N-gram is skipped when its final word or any history word is not
    in symtab (Find returns -1), when its final word is '<s>', or when
    '</s>' appears in its history.

    Raises RuntimeError if no proper suffix of an N-gram has a state in
    sidtab.
    """
    for ng in lm.mgrams(n - 1):
        word = ng.words[n - 1]
        wsym = symtab.Find(word)
        if wsym == -1:  # OOV
            continue
        if word == '<s>':  # non-event
            continue
        history = tuple(ng.words[:n - 1])
        if '</s>' in history:
            continue
        # Skip N-grams whose history contains an OOV word.  This has to
        # abandon the whole N-gram, not just the scan over its words.
        if any(symtab.Find(w) == -1 for w in history):
            continue
        src = sidtab[history]
        # Find longest suffix N-gram that exists
        spos = 1
        while tuple(ng.words[spos:]) not in sidtab:
            spos += 1
            # >= rather than == so that n == 1 cannot loop forever
            if spos >= n:
                raise RuntimeError(
                    "Unable to find suffix N-gram for %r" % (tuple(ng.words),))
        dest = sidtab[tuple(ng.words[spos:])]
        fst.AddArc(src, openfst.StdArc(wsym, wsym, -ng.log_prob, dest))
Ejemplo n.º 2
0
def build_dictfst(lmfst):
    """
    Build a character-to-word FST based on the symbol table of lmfst.

    The input symbols are the individual characters of every word in
    lmfst's input symbol table (plus "&epsilon;"); the output symbols
    are lmfst's input symbols.  Each word gets its own path from the
    start state: an epsilon:word arc, then one char:epsilon arc per
    character, then an epsilon arc into the single final state.  The
    word '</s>' is treated as one input symbol instead of being spelled
    out.  Entries with symbol ID 0 are ignored.
    """
    def spelling(word):
        # Use a single symbol for end-of-sentence
        if word == '</s>':
            return [word]
        return word

    insym = openfst.SymbolTable("letters")
    insym.AddSymbol("&epsilon;")
    outsym = lmfst.InputSymbols()

    fst = openfst.StdVectorFst()
    start = fst.AddState()
    fst.SetStart(start)
    final = fst.AddState()
    fst.SetFinal(final, 0)

    # First pass: register every input symbol that will be needed
    for word, label in outsym:
        if label != 0:
            for unit in spelling(word):
                insym.AddSymbol(unit)

    # Second pass: lay down one path per word
    for word, label in outsym:
        if label == 0:
            continue
        label = outsym.Find(word)
        # Add an epsilon:word arc to the first state of this word
        state = fst.AddState()
        fst.AddArc(start, openfst.StdArc(0, label, 0, state))
        for unit in spelling(word):
            unit_label = insym.Find(unit)
            following = fst.AddState()
            fst.AddArc(state, openfst.StdArc(unit_label, 0, 0, following))
            state = following
        # And an epsilon arc to the final state
        fst.AddArc(state, openfst.StdArc(0, 0, 0, final))

    fst.SetInputSymbols(insym)
    fst.SetOutputSymbols(outsym)
    return fst
Ejemplo n.º 3
0
    from optparse import OptionParser
    parser = OptionParser(usage="%prog CTL LATDIR")
    parser.add_option("--prune", type="int", default=5)
    parser.add_option("--errprune", type="float", default=1e-5)
    parser.add_option("--errpen", type="float", default=1.0)
    parser.add_option("--outext")
    parser.add_option("--errfst")
    opts, args = parser.parse_args(sys.argv[1:])
    ctlfile, latdir = args
    errfst = None
    if opts.errfst:
        errfst = openfst.StdVectorFst.Read(opts.errfst)
        openfst.Prune(errfst, opts.errprune)
        sigma = errfst.InputSymbols().Find("&sigma;")
        for node in errfst:
            for ai in errfst.mutable_iterarcs(node):
                arc = ai.Value()
                if arc.ilabel == sigma:
                    continue
                ai.SetValue(openfst.StdArc(arc.ilabel, arc.olabel, 
                                           arc.weight.Value() + opts.errpen,
                                           arc.nextstate))
        openfst.ArcSortInput(errfst)
    if opts.outext == None:
        opts.outext = ".fsg%d" % opts.prune
    for spam in file(ctlfile):
        latfile = os.path.join(latdir, spam.strip() + ".slf")
        fsgfile = os.path.join(latdir, spam.strip() + opts.outext)
        print spam,
        ofst = lat2fsg_posterior(latfile, fsgfile, opts.prune, errfst)
Ejemplo n.º 4
0
def add_mgram_states(fst, symtab, lm, m, sidtab, bo_label=0):
    """
    Add states and arcs for all M-grams in the language model, where M<N.

    Processes every entry of lm.mgrams(m), whose final word is
    words[m].  For each entry that is kept, a destination state is
    created (or the single shared final state is reused when the final
    word is '</s>'), a word arc is added from the history state, the
    new state is recorded in sidtab under the full word tuple, and a
    backoff arc labelled bo_label is added to the state of the suffix
    (or to state 0 for 1-grams).

    Entries are skipped when the final word is not in symtab, when a
    >1-gram starts with '</s>', or when the history has no state in
    sidtab.  '<s>' is a non-event: no word arc is added for it, and
    for 1-grams its state becomes the start state.
    """
    for gram in lm.mgrams(m):
        words = gram.words
        word = words[m]
        label = symtab.Find(word)
        if label == -1:
            continue  # skip mgrams ending in OOV
        if m > 0 and words[0] == '</s>':
            continue  # skip >1-grams starting with </s>

        # Locate the source state for the history
        if m == 0:
            origin = 0  # 1-grams start in backoff state
        else:
            history = tuple(words[0:m])
            if history not in sidtab:
                continue  # this means it has an OOV
            origin = sidtab[history]

        # Pick the destination state
        is_eos = word == '</s>'
        if is_eos:
            # only one final state is allowed, shared by every </s> arc
            eos_key = ('</s>', )
            if eos_key in sidtab:
                target = sidtab[eos_key]
            else:
                target = fst.AddState()
                fst.SetFinal(target, 0)
                sidtab[eos_key] = target
        else:
            target = fst.AddState()

        if word != '<s>':
            fst.AddArc(origin,
                       openfst.StdArc(label, label, -gram.log_prob, target))
        elif m == 0:
            # <s> is a non-event; its 1-gram state is the initial state
            fst.SetStart(target)

        if is_eos:
            # The final state gets no mapping of its own and no backoff
            continue

        # Record the freshly created state under the full word tuple
        sidtab[tuple(words)] = target

        # Create a backoff arc to the suffix M-1-gram
        # Note that if gram.log_bowt == 0 it's particularly important
        # to do this!
        if m == 0:
            fallback = 0  # backoff state
        else:
            suffix = tuple(words[1:])
            if suffix not in sidtab:
                continue  # Not a 1-gram, no suffix M-gram
            fallback = sidtab[suffix]
        fst.AddArc(
            target,
            openfst.StdArc(bo_label, bo_label, -gram.log_bowt, fallback))