Example #1
def get_transducer_symbol_table(self):
    transducer_symbol_table = fst.SymbolTable()
    all_segments_string = "".join(self.get_segments_symbols())
    all_segments_string += LEFT_APPLICATION_BRACKET + LEFT_CENTER_BRACKET + LEFT_IDENTITY_BRACKET
    all_segments_string += RIGHT_APPLICATION_BRACKET + RIGHT_CENTER_BRACKET + RIGHT_IDENTITY_BRACKET
    fst.linear_chain(all_segments_string, syms=transducer_symbol_table)
    return transducer_symbol_table
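Note the idiom in the last two lines: fst.linear_chain is called purely for its side effect of interning every character of the string into the supplied symbol table; the chain automaton it returns is discarded. A minimal sketch of the same trick (the string is hypothetical):

import fst

syms = fst.SymbolTable()
fst.linear_chain('abc', syms=syms)   # side effect: 'a', 'b', 'c' are added to syms
acceptor = fst.Acceptor(syms=syms)   # the populated table can now be shared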
Example #2
def letter_constraint(letterstring, syms):
    """Create an FSA that accepts the already-typed letters followed by sigma*."""
    thisfst = fst.linear_chain(letterstring, syms)
    sigmafst = sigma(syms).closure()
    thisfst.concatenate(sigmafst)
    thisfst.remove_epsilon()
    return thisfst.determinize()
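sigma(syms) is a helper assumed by this example: an acceptor that matches any single symbol in the table, so its closure accepts sigma*. A minimal sketch under that assumption (epsilon, label 0, is skipped):

import fst

def sigma(syms):
    # One arc per non-epsilon symbol, from state 0 to final state 1.
    fsa = fst.Acceptor(syms=syms)
    for symbol, code in syms.items():
        if code == 0:  # skip epsilon
            continue
        fsa.add_arc(0, 1, symbol)
    fsa[1].final = True
    return fsa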
Example #3
def load_nbest_iter(self, nbest_file):
    """
    cat >text.fst <<EOF
    0 1 a x .5
    0 1 b y 1.5
    1 2 c z 2.5
    2 3.5
    EOF
    """
    a = fst.Acceptor()
    sigma = fst.SymbolTable()

    with open(nbest_file, 'r') as f_in:
        for line in f_in:
            line = line.strip()

            b = fst.linear_chain(line.split(), sigma)
            a = a.union(b)

            a.remove_epsilon()
            a = a.determinize()
            a.minimize()

    self.fsa = a
    self.syms = sigma
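Determinizing and minimizing inside the loop keeps the intermediate automaton small at the cost of redoing the work for every hypothesis; Example #12 below defers both steps until after the union. A rough sketch of that deferred variant, using the in-place union operator from Example #9 (the file name is hypothetical):

import fst

sigma = fst.SymbolTable()
a = fst.Acceptor(syms=sigma)
with open('nbest.txt') as f_in:   # hypothetical n-best list, one hypothesis per line
    for line in f_in:
        a |= fst.linear_chain(line.split(), sigma)
a.remove_epsilon()
a = a.determinize()
a.minimize()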
Example #4
def extract_from_poses(pos_seq, phrase_transducer, vocab=None, draw_composition=None, **kwargs):
    """returns span indices for phrases"""
    # t0 = time.time()
    if isinstance(phrase_transducer, (str, unicode)):
        phrase_transducer = get_fst(phrase_transducer)
    if vocab is None:
        vocab = set(sym for sym, num in phrase_transducer.isyms.items())
    oovize = lambda seq: [x if x in vocab else 'O' for x in seq]
    a = set((k, v) for k, v in kwargs.iteritems())   # note: a and b appear unused
    b = set(pos_seq)
    assert 'O' in vocab

    pos_seq = oovize(pos_seq)
    # print "setup elapsed %.3f ms" % ((time.time() - t0)*1e3)
    input_transducer = fst.linear_chain(pos_seq, syms=phrase_transducer.isyms)
    #print vocab
    #print input_transducer
    composed = input_transducer >> phrase_transducer
    if len(composed)==0: return []
    #draw_pdf(input_transducer, "t.pdf")
    if draw_composition:
        draw_pdf(composed, draw_composition)
    # print "%s states" % len(composed)
    # print "composition elapsed %.3f ms" % ((time.time() - t0)*1e3)
    ret = extract_from_composition(composed, **kwargs)
    ret = list(ret)  # might not always be necessary?
    # print "extraction elapsed %.3f ms" % ((time.time() - t0)*1e3)
    return ret
Example #5
def get_acceptor_for_word(self, word, syms):
    try:
        return self._word_acceptors_cache[word]
    except KeyError:
        acceptor = linear_chain(word, syms=syms)
        self._word_acceptors_cache[word] = acceptor
        return acceptor
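The method assumes a host object carrying a _word_acceptors_cache dict. A minimal self-contained sketch (the class name is hypothetical) showing the memoization at work:

import fst
from fst import linear_chain

class WordAcceptors(object):
    def __init__(self):
        # Cache assumed by get_acceptor_for_word above.
        self._word_acceptors_cache = {}

    def get_acceptor_for_word(self, word, syms):
        try:
            return self._word_acceptors_cache[word]
        except KeyError:
            acceptor = linear_chain(word, syms=syms)
            self._word_acceptors_cache[word] = acceptor
            return acceptor

syms = fst.SymbolTable()
wa = WordAcceptors()
first = wa.get_acceptor_for_word('hello', syms)
second = wa.get_acceptor_for_word('hello', syms)
assert first is second   # the second lookup is a cache hit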
Example #6
def distance(a, b):
    # Compose a o edit transducer o b
    composed = fst.linear_chain(a, syms) >> edit >> fst.linear_chain(b, syms)
    # Compute distance
    distances = composed.shortest_distance(reverse=True)
    dist = int(distances[0])
    # Find best alignment
    alignment = composed.shortest_path()
    # Re-order states
    alignment.top_sort()
    # Replace <epsilon> -> "-"
    alignment.relabel({fst.EPSILON: '-'}, {fst.EPSILON: '-'})
    # Read alignment on the arcs of the transducer
    arcs = (next(state.arcs) for state in alignment)
    labels = ((arc.ilabel, arc.olabel) for arc in arcs)
    align = [(alignment.isyms.find(x), alignment.osyms.find(y)) for x, y in labels]
    return dist, align
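distance relies on two module-level objects, syms and edit. A minimal sketch of a unit-cost edit transducer over a hypothetical two-letter alphabet, assuming pyfst's Transducer API: a single state with match, substitution, deletion, and insertion arcs:

import fst

syms = fst.SymbolTable()
edit = fst.Transducer(isyms=syms, osyms=syms)
for x in 'ab':   # hypothetical alphabet
    for y in 'ab':
        # Matching costs 0, substituting costs 1.
        edit.add_arc(0, 0, x, y, 0 if x == y else 1)
    edit.add_arc(0, 0, x, fst.EPSILON, 1)   # deletion
    edit.add_arc(0, 0, fst.EPSILON, x, 1)   # insertion
edit[0].final = True

dist, align = distance('ab', 'ba')   # dist == 2: two substitutions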
Example #7
def test_closure():
    t = fst.linear_chain('ab')
    result = t.closure_plus()
    eq_(len(result), len(t))
    result.remove_epsilon()
    expected = t + t.closure()
    expected.remove_epsilon()
    eq_(result, expected)
Example #8
def create_symbol_table(words):
    # Collect the distinct symbols, then let linear_chain intern them into a
    # fresh symbol table; the chain FSA itself is only a means to that end.
    symbol_set = set()
    for word in words:
        symbol_set.update(word)

    symbol_fst = fst.linear_chain(''.join(symbol_set))

    return symbol_fst.isyms
Example #9
def create_min_fst(words, symbol_table):
    fst_union = fst.StdVectorFst(isyms=symbol_table, osyms=symbol_table)

    for word in words:
        fst_union |= fst.linear_chain(word, symbol_table)

    print '- naive FST has {0} states and {1} arcs'.format(*automata_size(fst_union))

    fst_union.remove_epsilon()
    dfst_union = fst_union.determinize()
    dfst_union.minimize()

    print '- minimized FST has {0} states and {1} arcs'.format(*automata_size(dfst_union))

    return dfst_union
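Together with create_symbol_table from Example #8, building a minimal dictionary automaton then takes two calls (the word list is hypothetical):

words = ['cat', 'car', 'cart']
syms = create_symbol_table(words)
dictionary_fst = create_min_fst(words, syms)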
Example #10
def test_simple():
    t = fst.Transducer()
    for i, (ic, oc) in enumerate(zip('hello', 'olleh')):
        t.add_arc(i, i+1, ic, oc)
    t[i+1].final = True
    eq_(len(t), 6)
    ok_(t[5].final)

    a = fst.Acceptor()
    for i, c in enumerate('hello'):
        a.add_arc(i, i+1, c)
    a[i+1].final = True
    eq_(len(a), 6)
    ok_(a[5].final)

    l = fst.linear_chain('hello')
    eq_(a, l)
Example #11
def extract_from_poses(pos_seq, phrase_transducer, vocab=None, draw_composition=None, tagset='auto', **kwargs):
    """
    tagset could be:  'ark', 'ptb', 'coarse', 'auto'
        ark: Gimpel et al 2011's twitter tagset
        ptb: Penn Treebank
        coarse: Petrov et al's Universal POS tagset
        auto: try to detect. this may waste time.
    (todo: use NLTK's tagset naming system. nathan contributed to it)
    RETURNS: span indices for phrases
    """
    # t0 = time.time()
    if isinstance(phrase_transducer, (str, unicode)):
        tname = phrase_transducer
        phrase_transducer = get_fst(tname)
        vocab = FSTS[tname + "_vocab"]
    if vocab is None:
        vocab = set(sym for sym, num in phrase_transducer.isyms.items())
    # print "0setup elapsed %.3f ms" % ((time.time() - t0)*1e3)
    # t0 = time.time()

    pos_seq = preprocess_tags(pos_seq, vocab, tagset)
    # print "1tagpreproc elapsed %.3f ms" % ((time.time() - t0)*1e3)
    # t0 = time.time()

    input_transducer = fst.linear_chain(pos_seq, syms=phrase_transducer.isyms)
    # print "2linchain elapsed %.3f ms" % ((time.time() - t0)*1e3)
    # t0 = time.time()

    composed = input_transducer >> phrase_transducer
    if len(composed)==0: return []
    if draw_composition:
        draw_pdf(composed, draw_composition)
    # print "3composition elapsed %.3f ms" % ((time.time() - t0)*1e3)
    # t0 = time.time()

    ret = extract_from_composition2(composed, **kwargs)
    # print "4extraction elapsed %.3f ms" % ((time.time() - t0)*1e3)
    return ret
Example #12
def load_nbest(self, nbest_file):
    """
    cat >text.fst <<EOF
    0 1 a x .5
    0 1 b y 1.5
    1 2 c z 2.5
    2 3.5
    EOF
    """
    a = fst.Acceptor(syms=self.syms)

    with open(nbest_file, 'r') as f_in:
        for line in f_in:
            line = line.strip()

            b = fst.linear_chain(line.split(), self.syms)
            a = a.union(b)

    a.remove_epsilon()
    d = a.determinize()
    d.minimize()

    self.fsa = d
    self.fsa.top_sort()
Example #13
def PrintOutputsForInput(transducer, input_str):
    inp = fst.linear_chain(input_str, syms=transducer.isyms, semiring=semiring)
    combined = (inp >> transducer)
    PrintFullPaths(combined)
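Both semiring and PrintFullPaths are assumed globals here. A rough stand-in for the latter, restricted to the single best path and reusing the state/arc iteration pattern from Example #14 below:

import fst

semiring = 'tropical'   # assumed; pyfst's default semiring

def PrintFullPaths(combined):
    # Hypothetical stand-in: print the output string of the best path only.
    best = combined.shortest_path()
    best.top_sort()   # order the states along the linear path
    print ''.join(best.osyms.find(arc.olabel)
                  for state in best for arc in state.arcs
                  if arc.olabel != 0)   # label 0 is epsilon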
Example #14
    try:
        pron = oov_dict[unicode(w, 'utf-8')]
    except KeyError:
        '''ipa_pron = False
        #print 'word is OOV, attempting phonetisaurus lookup'
        w = unicode(w, 'utf-8')
        pron = subprocess.check_output(['phonetisaurus-g2p', '--model='+AR_FST_FILE, '--input='+w.encode('utf-8')])
        pron = pron.rstrip()
        endnum = pron.index('\t')
        pron = pron[endnum+1:]
        pron = ''.join(pron.split())
        pron = pron.decode('utf-8')
        #manually handle characters not in my phonetisaurus fst
        pron = pron.replace(u'\u0644', u'\006c')
        pron = pron.replace(u'\u0622', 'CA')
        print 'reverting to phonetisaurus' '''
        wfst = fst.linear_chain(unicode(w, 'utf-8'), syms=alt_ar_fst.isyms)
        ppath = wfst.compose(alt_ar_fst).shortest_path(1)
        pron = ''
        for path in ppath:
            pron += ''.join(ppath.osyms.find(arc.olabel) for arc in path)
    # try:
    pron = pron.encode('utf-8')
    pron = pron.replace('\x06', '')
    # if ipa_pron:
    pron = unicode(pron, 'utf-8')
    longcs = []  # long consonants to be replaced
    extra_long = []  # long consonants that would be replaced by two colons if this didn't exist
    for i in range(len(pron) - 1):
        if pron[i] == pron[i + 1] and pron[i] not in [u'a', u'u', u'i', u"\u02D0"]:
            longcs.append(pron[i])
            if i < len(pron) - 2 and pron[i + 2] == u"\u02D0":