def get_transducer_symbol_table(self):
    transducer_symbol_table = fst.SymbolTable()
    all_segments_string = "".join(self.get_segments_symbols())
    all_segments_string += LEFT_APPLICATION_BRACKET + LEFT_CENTER_BRACKET + LEFT_IDENTITY_BRACKET
    all_segments_string += RIGHT_APPLICATION_BRACKET + RIGHT_CENTER_BRACKET + RIGHT_IDENTITY_BRACKET
    # linear_chain() is called only for its side effect of populating the
    # symbol table with every segment and bracket symbol.
    fst.linear_chain(all_segments_string, syms=transducer_symbol_table)
    return transducer_symbol_table

def letter_constraint(letterstring, syms):
    """Create an FSA that accepts the already-typed letters followed by sigma*."""
    thisfst = fst.linear_chain(letterstring, syms)
    sigmafst = sigma(syms).closure()
    thisfst.concatenate(sigmafst)
    thisfst.remove_epsilon()
    return thisfst.determinize()

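# `sigma` is not defined in this listing; a minimal sketch of what it
# presumably builds (an acceptor matching any single non-epsilon symbol in
# `syms`), so that sigma(syms).closure() matches any suffix:
import fst

def sigma(syms):
    f = fst.Acceptor(syms=syms)
    for symbol, code in syms.items():
        if code == 0:
            continue  # skip epsilon
        f.add_arc(0, 1, symbol)
    f[1].final = True
    return f
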
def load_nbest_iter(self, nbest_file):
    """
    cat >text.fst <<EOF
    0 1 a x .5
    0 1 b y 1.5
    1 2 c z 2.5
    2 3.5
    EOF
    """
    a = fst.Acceptor()
    sigma = fst.SymbolTable()
    with open(nbest_file, 'r') as f_in:
        for line in f_in:
            line = line.strip()
            b = fst.linear_chain(line.split(), sigma)
            a = a.union(b)
    a.remove_epsilon()
    a = a.determinize()
    a.minimize()
    self.fsa = a
    self.syms = sigma

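# Usage sketch (hypothetical file contents): each line of the n-best file
# is one space-separated hypothesis, and each hypothesis becomes one path
# in the determinized, minimized acceptor:
#
#   the cat sat
#   the cat sag
#
# decoder.load_nbest_iter('nbest.txt')   # `decoder` is an assumed instance
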
def extract_from_poses(pos_seq, phrase_transducer, vocab=None,
                       draw_composition=None, **kwargs):
    """Returns span indices for phrases."""
    # t0 = time.time()
    if isinstance(phrase_transducer, (str, unicode)):
        phrase_transducer = get_fst(phrase_transducer)
    if vocab is None:
        vocab = set(sym for sym, num in phrase_transducer.isyms.items())
    oovize = lambda seq: [x if x in vocab else 'O' for x in seq]
    a = set([(k, v) for k, v in kwargs.iteritems()])  # unused?
    b = set(pos_seq)                                  # unused?
    assert 'O' in vocab
    pos_seq = oovize(pos_seq)
    # print "setup elapsed %.3f ms" % ((time.time() - t0)*1e3)
    input_transducer = fst.linear_chain(pos_seq, syms=phrase_transducer.isyms)
    # print vocab
    # print input_transducer
    composed = input_transducer >> phrase_transducer
    if len(composed) == 0:
        return []
    # draw_pdf(input_transducer, "t.pdf")
    if draw_composition:
        draw_pdf(composed, draw_composition)
    # print "%s states" % len(composed)
    # print "composition elapsed %.3f ms" % ((time.time() - t0)*1e3)
    ret = extract_from_composition(composed, **kwargs)
    ret = list(ret)  # might not always be necessary?
    # print "extraction elapsed %.3f ms" % ((time.time() - t0)*1e3)
    return ret

def get_acceptor_for_word(self, word, syms):
    try:
        return self._word_acceptors_cache[word]
    except KeyError:
        acceptor = linear_chain(word, syms=syms)
        self._word_acceptors_cache[word] = acceptor
        return acceptor

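# Caching note: linear_chain() builds a fresh automaton on every call, so
# memoizing by word returns the same acceptor object on repeated lookups.
# Usage sketch (instance and symbol table are assumed):
# acc1 = obj.get_acceptor_for_word('cat', syms)
# acc2 = obj.get_acceptor_for_word('cat', syms)  # cache hit: acc1 is acc2
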
def distance(a, b):
    # Compose a o edit transducer o b
    composed = fst.linear_chain(a, syms) >> edit >> fst.linear_chain(b, syms)
    # Compute distance
    distances = composed.shortest_distance(reverse=True)
    dist = int(distances[0])
    # Find best alignment
    alignment = composed.shortest_path()
    # Re-order states
    alignment.top_sort()
    # Replace <epsilon> -> "-"
    alignment.relabel({fst.EPSILON: '-'}, {fst.EPSILON: '-'})
    # Read alignment off the arcs of the transducer
    arcs = (next(state.arcs) for state in alignment)
    labels = ((arc.ilabel, arc.olabel) for arc in arcs)
    align = [(alignment.isyms.find(x), alignment.osyms.find(y))
             for x, y in labels]
    return dist, align

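# The function above assumes a shared symbol table `syms` and an edit
# transducer `edit` defined elsewhere. A minimal sketch of both, over a
# toy alphabet (the alphabet and unit costs are assumptions):
import fst

alphabet = 'abcd'
syms = fst.SymbolTable()
edit = fst.Transducer(isyms=syms, osyms=syms)
for x in alphabet:
    edit.add_arc(0, 0, x, x, 0)            # match: cost 0
    edit.add_arc(0, 0, x, fst.EPSILON, 1)  # deletion: cost 1
    edit.add_arc(0, 0, fst.EPSILON, x, 1)  # insertion: cost 1
    for y in alphabet:
        if x != y:
            edit.add_arc(0, 0, x, y, 1)    # substitution: cost 1
edit[0].final = True

# dist, align = distance('bad', 'dab')
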
def test_closure():
    t = fst.linear_chain('ab')
    result = t.closure_plus()
    eq_(len(result), len(t))
    result.remove_epsilon()
    expected = t + t.closure()
    expected.remove_epsilon()
    eq_(result, expected)

def create_symbol_table(words):
    # Collect the distinct symbols across all words.
    symbol_set = set()
    for word in words:
        symbol_set.update(word)
    # Build a throwaway linear chain just to obtain a populated symbol table.
    symbol_fst = fst.linear_chain(''.join(symbol_set))
    return symbol_fst.isyms

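# Usage sketch: build the shared symbol table once and reuse it for every
# word acceptor (as create_min_fst below does):
# symtab = create_symbol_table(['cat', 'cart', 'card'])
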
def create_min_fst(words, symbol_table):
    fst_union = fst.StdVectorFst(isyms=symbol_table, osyms=symbol_table)
    for word in words:
        fst_union |= fst.linear_chain(word, symbol_table)
    print '- naive FST has {0} states and {1} arcs'.format(*automata_size(fst_union))
    fst_union.remove_epsilon()
    dfst_union = fst_union.determinize()
    dfst_union.minimize()
    print '- minimized FST has {0} states and {1} arcs'.format(*automata_size(dfst_union))
    return dfst_union

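# `automata_size` is not shown in this listing; a minimal sketch of the
# (state count, arc count) pair it presumably returns:
def automata_size(f):
    num_states = len(f)
    num_arcs = sum(1 for state in f for arc in state.arcs)
    return num_states, num_arcs
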
def test_simple():
    t = fst.Transducer()
    for i, (ic, oc) in enumerate(zip('hello', 'olleh')):
        t.add_arc(i, i + 1, ic, oc)
    t[i + 1].final = True
    eq_(len(t), 6)
    ok_(t[5].final)

    a = fst.Acceptor()
    for i, c in enumerate('hello'):
        a.add_arc(i, i + 1, c)
    a[i + 1].final = True
    eq_(len(a), 6)
    ok_(a[5].final)

    l = fst.linear_chain('hello')
    eq_(a, l)

def extract_from_poses(pos_seq, phrase_transducer, vocab=None,
                       draw_composition=None, tagset='auto', **kwargs):
    """
    tagset can be one of:
      'ark':    Gimpel et al. (2011)'s Twitter tagset
      'ptb':    Penn Treebank
      'coarse': Petrov et al.'s Universal POS tagset
      'auto':   try to detect; this may waste time.
    (TODO: use NLTK's tagset naming system. Nathan contributed to it.)

    RETURNS: span indices for phrases
    """
    # t0 = time.time()
    if isinstance(phrase_transducer, (str, unicode)):
        tname = phrase_transducer
        phrase_transducer = get_fst(tname)
        vocab = FSTS[tname + "_vocab"]
    if vocab is None:
        vocab = set(sym for sym, num in phrase_transducer.isyms.items())
    # print "0setup elapsed %.3f ms" % ((time.time() - t0)*1e3)

    pos_seq = preprocess_tags(pos_seq, vocab, tagset)
    # print "1tagpreproc elapsed %.3f ms" % ((time.time() - t0)*1e3)

    input_transducer = fst.linear_chain(pos_seq, syms=phrase_transducer.isyms)
    # print "2linchain elapsed %.3f ms" % ((time.time() - t0)*1e3)

    composed = input_transducer >> phrase_transducer
    if len(composed) == 0:
        return []
    if draw_composition:
        draw_pdf(composed, draw_composition)
    # print "3composition elapsed %.3f ms" % ((time.time() - t0)*1e3)

    ret = extract_from_composition2(composed, **kwargs)
    # print "4extraction elapsed %.3f ms" % ((time.time() - t0)*1e3)
    return ret

def load_nbest(self, nbest_file):
    """
    cat >text.fst <<EOF
    0 1 a x .5
    0 1 b y 1.5
    1 2 c z 2.5
    2 3.5
    EOF
    """
    a = fst.Acceptor(syms=self.syms)
    with open(nbest_file, 'r') as f_in:
        for line in f_in:
            line = line.strip()
            b = fst.linear_chain(line.split(), self.syms)
            a = a.union(b)
    a.remove_epsilon()
    d = a.determinize()
    d.minimize()
    self.fsa = d
    self.fsa.top_sort()

def PrintOutputsForInput(transducer, input_str):
    inp = fst.linear_chain(input_str, syms=transducer.isyms, semiring=semiring)
    combined = (inp >> transducer)
    PrintFullPaths(combined)

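# `PrintFullPaths` is not shown in this listing; a minimal sketch that
# walks every path of the (assumed acyclic) composition and prints each
# path's output string:
def PrintFullPaths(f):
    def walk(state_id, out):
        state = f[state_id]
        if state.final:
            print ''.join(out)
        for arc in state.arcs:
            suffix = [f.osyms.find(arc.olabel)] if arc.olabel != 0 else []
            walk(arc.nextstate, out + suffix)
    walk(f.start, [])
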
try:
    pron = oov_dict[unicode(w, 'utf-8')]
except KeyError:
    '''ipa_pron = False
    #print 'word is OOV, attempting phonetisaurus lookup'
    w = unicode(w, 'utf-8')
    pron = subprocess.check_output(['phonetisaurus-g2p',
                                    '--model=' + AR_FST_FILE,
                                    '--input=' + w.encode('utf-8')])
    pron = pron.rstrip()
    endnum = pron.index('\t')
    pron = pron[endnum+1:]
    pron = ''.join(pron.split())
    pron = pron.decode('utf-8')
    # manually handle characters not in my phonetisaurus fst
    pron = pron.replace(u'\u0644', u'\006c')
    pron = pron.replace(u'\u0622', 'CA')
    print 'reverting to phonetisaurus'
    '''
    # Transduce the word through the grapheme-to-phoneme FST and read the
    # pronunciation off the single best path.
    wfst = fst.linear_chain(unicode(w, 'utf-8'), syms=alt_ar_fst.isyms)
    ppath = wfst.compose(alt_ar_fst).shortest_path(1)
    pron = ''
    for path in ppath:
        pron += ''.join(ppath.osyms.find(arc.olabel) for arc in path)
    # try:
    pron = pron.encode('utf-8')
    pron = pron.replace('\x06', '')
    # if ipa_pron:
    pron = unicode(pron, 'utf-8')

longcs = []      # long consonants to be replaced
extra_long = []  # long consonants that would be replaced by two colons if this didn't exist
for i in range(len(pron) - 1):
    if pron[i] == pron[i + 1] and pron[i] not in [u'a', u'u', u'i', u"\u02D0"]:
        longcs.append(pron[i])
        if i < len(pron) - 2 and pron[i + 2] == u"\u02D0":