def test_replace():
    syms = fst.SymbolTable()
    a1 = fst.Acceptor(syms)
    a1.add_arc(0, 1, 'dial')
    a1.add_arc(1, 2, 'google')
    a1.add_arc(1, 2, '$name')
    a1.add_arc(2, 3, 'please')
    a1[3].final = True

    a2 = fst.Acceptor(syms)
    a2.add_arc(0, 1, 'michael')
    a2.add_arc(1, 2, 'riley')
    a2.add_arc(0, 1, '$firstname')
    a2.add_arc(1, 2, '$lastname')
    a2[2].final = True

    a3 = fst.Acceptor(syms)
    a3.add_arc(0, 1, 'johan')
    a3[1].final = True

    a4 = fst.Acceptor(syms)
    a4.add_arc(0, 1, 'schalkwyk')
    a4[1].final = True

    result = a1.replace({'$name': a2, '$firstname': a3, '$lastname': a4},
                        epsilon=True)
    result.remove_epsilon()

    expected = fst.Acceptor(syms)
    expected.add_arc(0, 1, 'dial')
    expected.add_arc(1, 2, 'google')
    expected.add_arc(1, 3, fst.EPSILON)
    expected.add_arc(3, 5, 'michael')
    expected.add_arc(3, 6, fst.EPSILON)
    expected.add_arc(6, 9, 'johan')
    expected.add_arc(9, 5, fst.EPSILON)
    expected.add_arc(5, 7, 'riley')
    expected.add_arc(5, 8, fst.EPSILON)
    expected.add_arc(8, 10, 'schalkwyk')
    expected.add_arc(10, 7, fst.EPSILON)
    expected.add_arc(7, 2, fst.EPSILON)
    expected.add_arc(2, 4, 'please')
    expected[4].final = True
    expected.remove_epsilon()

    eq_(result, expected)

def sigma(syms):
    """Create a two-state acceptor that accepts any single symbol in syms."""
    thisfst = fst.Acceptor(syms=syms)
    for sym, val in syms.items():
        if val > 0:  # skip epsilon, which has symbol id 0
            thisfst.add_arc(0, 1, sym)
    thisfst[1].final = True
    return thisfst

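# Hypothetical usage sketch (not from the original source), assuming the
# pyfst-style API used above: build a sigma machine over a 3-letter alphabet.
syms = fst.SymbolTable()
for letter in 'abc':
    syms[letter]  # looking up a new symbol assigns it the next nonzero id
any_letter = sigma(syms)  # accepts exactly one of 'a', 'b', or 'c'
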
def test_shortest_distance():
    t = fst.Acceptor()
    t.add_arc(0, 1, 'a', 3)
    t.add_arc(1, 1, 'b', 2)
    t.add_arc(1, 3, 'c', 4)
    t.add_arc(0, 2, 'd', 5)
    t.add_arc(2, 3, 'f', 4)
    t[3].final = 3
    eq_([float(v) for v in t.shortest_distance()], [0, 3, 5, 7])
    eq_([float(v) for v in t.shortest_distance(True)], [10, 7, 7, 3])

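# Worked check of the expected distances above (tropical semiring:
# plus = min, times = +):
#   forward:  d(0)=0, d(1)=0+3=3, d(2)=0+5=5, d(3)=min(3+4, 5+4)=7
#   backward (includes the final weight 3 at state 3):
#             d(3)=3, d(2)=4+3=7, d(1)=4+3=7, d(0)=min(3+7, 5+7)=10
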
def test_paths():
    t = fst.Acceptor()
    t.add_arc(0, 1, 'a')
    t.add_arc(1, 2, 'b')
    t.add_arc(0, 2, 'c')
    t.add_arc(2, 3, 'd')
    t.add_arc(3, 4, 'e')
    t[2].final = True
    t[4].final = True
    words = set(''.join(t.isyms.find(arc.ilabel) for arc in path)
                for path in t.paths())
    eq_(words, set(('ab', 'c', 'abde', 'cde')))

def calculate_unigram_constraint(counts, syms=None):
    """Build a one-state acceptor whose self-loops carry unigram -log probs."""
    probs = {}
    total = 0
    thisfst = fst.Acceptor(syms=syms)
    for word in counts:
        total += counts[word]
    for word in counts:
        probs[word] = float(counts[word]) / total
        # floor the probability to avoid log(0)
        if probs[word] <= 1e-17:
            probs[word] = 1e-17
        thisfst.add_arc(0, 0, word, -math.log(probs[word]))
    thisfst[0].final = True
    return thisfst

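# Hypothetical usage sketch (not from the original source): weight each
# word by its unigram -log probability.
counts = {'the': 3, 'cat': 1}
uni = calculate_unigram_constraint(counts)
# 'the' gets weight -log(0.75) and 'cat' gets -log(0.25), both as
# self-loops on the single final state 0.
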
def test_simple():
    t = fst.Transducer()
    for i, (ic, oc) in enumerate(zip('hello', 'olleh')):
        t.add_arc(i, i + 1, ic, oc)
    t[i + 1].final = True
    eq_(len(t), 6)
    ok_(t[5].final)

    a = fst.Acceptor()
    for i, c in enumerate('hello'):
        a.add_arc(i, i + 1, c)
    a[i + 1].final = True
    eq_(len(a), 6)
    ok_(a[5].final)

def test_randgen():
    t = fst.Acceptor()
    t.add_arc(0, 1, 'a', 0.5)
    t.add_arc(1, 2, 'b', 0.5)
    t.add_arc(0, 2, 'ab', 1.0)
    t.add_arc(2, 3, 'c')
    t.add_arc(3, 4, 'd')
    t.add_arc(2, 4, 'cd')
    t[4].final = True
    r = t.uniform_generate()
    # check that r \in t
    eq_(r & t.remove_weights(), r)
    r = t.logprob_generate()
    # check that r \in t
    eq_(r & t.remove_weights(), r)

def bfs(stateid, allLM, sym):
    """Extract the branch rooted at `stateid` from the tree FST `allLM`,
    breadth-first, so its paths can be read off easily later. States are
    renumbered from 0 regardless of their ids in the original tree.

    INPUT:
        stateid  state to begin the branch from
        allLM    the big tree FST
        sym      symbol table of allLM
    OUTPUT:
        branch, an FST for the desired branch
    """
    branch = fst.Acceptor(syms=sym)
    queue = [stateid]
    st_encod = {}
    i = 0
    while queue:
        state = allLM[queue[0]]
        if i == 0:
            sid = i
            i += 1
        else:
            sid = st_encod[state.stateid]
        for arc in state.arcs:
            nextst = arc.nextstate
            if nextst not in st_encod:  # encode states beginning from id 0
                st_encod[nextst] = i
                i += 1
            queue.append(nextst)
            label = sym.find(arc.ilabel)
            w = arc.weight
            branch.add_arc(sid, st_encod[nextst], label, w)
        queue = queue[1:]
    branch[i - 1].final = True
    return branch

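# Hypothetical usage sketch (not from the original source): extract the
# branch below state 1 of a small tree-shaped acceptor.
sym = fst.SymbolTable()
tree = fst.Acceptor(syms=sym)
tree.add_arc(0, 1, 'a')
tree.add_arc(1, 2, 'b')
tree.add_arc(1, 3, 'c')
sub = bfs(1, tree, sym)  # acceptor with arcs 'b' and 'c' out of its state 0
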
def fst_alter_sent(self, words, numalts=5, cutoff=0):
    # with NLTK we could do POS tagging here
    # pos = nltk.pos_tag(text)
    # instead, we just make everything NN
    pos = [(w, 'NN') for w in words]

    altfst = fst.Acceptor(syms=self.lmfst.isyms)

    for idx, (word, tag) in enumerate(pos):
        # add the word to the lattice
        if word in altfst.isyms:
            altfst.add_arc(idx, idx + 1, word, 0)
        else:
            altfst.add_arc(idx, idx + 1, "<unk>", 0)

        # add word alternatives to the lattice
        if (tag.startswith('NN') or tag.startswith('JJ') or
                tag.startswith('RB') or tag.startswith('VB')) and \
                word not in ['have', 'has', 'had', 'is', 'are', 'am',
                             'was', 'were', 'be', '.', ',', ':', '?',
                             '!', '-', '--', 'of'] and \
                not word.startswith("'"):
            nearlist = self.vecs.near(word, 5)

            # check if there are any neighbors at all
            if nearlist is None:
                continue

            # add each neighbor to the lattice
            for widx, (dist, w) in enumerate(nearlist):
                if dist > 0.1 and w in altfst.isyms and w != word:
                    altfst.add_arc(idx, idx + 1, w,
                                   (math.log(dist) * -1) / 1000)

    # mark the final state in the FST
    altfst[len(words)].final = True

    # rescore the lattice using the language model
    scoredfst = self.lmfst.compose(altfst)

    # get best paths in the rescored lattice
    bestpaths = scoredfst.shortest_path(numalts)
    bestpaths.remove_epsilon()

    altstrings = {}

    # get the strings and weights from the best paths
    for i, path in enumerate(bestpaths.paths()):
        path_string = ' '.join(bestpaths.isyms.find(arc.ilabel)
                               for arc in path)
        path_weight = functools.reduce(operator.mul,
                                       (arc.weight for arc in path))
        if path_string not in altstrings:
            altstrings[path_string] = path_weight

    # print('Altstrings:')
    # print(altstrings)

    # sort strings by weight; the float is parsed out of the weight's
    # repr, e.g. "TropicalWeight(3.14)"
    scoredstrings = []
    for sent in altstrings:
        score = float(("%s" % altstrings[sent]).split('(')[1].strip(')'))
        scoredstrings.append((score, sent))

    scoredstrings = self.sent_rescore(scoredstrings)
    scoredstrings.sort()

    if len(scoredstrings) > numalts:
        scoredstrings = scoredstrings[:numalts]  # fixed: was 'scoredstring'
    if cutoff > 0:
        scoredstrings = [s for s in scoredstrings if s[0] <= cutoff]

    # print('Scoredstrings:')
    # print(scoredstrings)

    return scoredstrings

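# Hypothetical usage sketch (not from the original source): assuming an
# object wired up with a language-model FST (self.lmfst), a word-vector
# store (self.vecs), and a sent_rescore() method, propose up to 3
# rescored alternatives for a sentence.
# alts = model.fst_alter_sent(['dial', 'google', 'please'], numalts=3)
# for score, sent in alts:
#     print(score, sent)
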
# (This excerpt begins mid-statement; the code that builds the earlier
#  transition arcs is not included here.)
                       -math.log(A_full_table[i][j]))

# arcs from each tag state to the final state, weighted by -log of the
# tag-to-sentence-end transition probability
for tag in full_tag_set:
    i = dict_tags[tag]
    HMM_tagger.add_arc(num_temp + i, 2 * num_temp + 1, eps, eps,
                       -math.log(A_full_table[i][num_temp + 1]))
HMM_tagger[num_temp * 2 + 1].final = True

###########################################################
# test part
# You can run the part above once, then run this test part as many
# times as you like, changing `index` each time.
# It uses sentences from test_set and outputs the sentence, the right
# answer, and the predicted answer.
###########################################################
test = fst.Acceptor(HMM_tagger.isyms)
num_temp = 0

# index of the testing sentence
index = 0
right_ans = []
for (word, tag) in test_set[index]:
    test.add_arc(num_temp, num_temp + 1, word)
    num_temp = num_temp + 1
    right_ans.append(tag)
test[num_temp].final = True

# compose the sentence with the tagger and keep the best path
test = (test >> HMM_tagger).shortest_path()
test.project_output()

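# Hypothetical continuation (not in the original excerpt), assuming the
# pyfst API used above: read the predicted tags off the best path and
# compare them with right_ans.
test.remove_epsilon()
my_ans = [test.osyms.find(arc.olabel)
          for path in test.paths()
          for arc in path if arc.olabel > 0]
print(right_ans)
print(my_ans)
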
rep_text = []
file_directory = "../Data_Engineer_ASAPP_Challenge/sample_conversations.json"
json_data = open(file_directory).read()
data = json.loads(json_data)

# collect the (lowercased) representative-side messages
for msg in data["Issues"]:
    for cstmr in msg["Messages"]:
        if cstmr["IsFromCustomer"] is False:
            rep_text.append(cstmr["Text"].lower())
rep_text = sorted(rep_text)

# extract words for syms
# build prefix tree
# rep_text = ["I'm", "I'm at", "I want", "what are", "what he", "where?"]
syms = fst.SymbolTable()
prompt_main = fst.Acceptor(syms)
for pmt in rep_text:
    prompt = fst.Acceptor(syms)
    for i, ch in enumerate(pmt):
        prompt.add_arc(i, i + 1, ch, 1)
    prompt[i + 1].final = True
    prompt_main.set_union(prompt)

prompt_main.remove_epsilon()
prompt_main = prompt_main.determinize()
prompt_main.minimize()
prompt_main = prompt_main.push_weights(final=False)

# write to dot and draw
names = ["prfx_tree"]
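# Hypothetical continuation (not from the original source), assuming
# pyfst's draw(), which renders the machine in Graphviz dot format:
# with open(names[0] + '.dot', 'w') as f:
#     f.write(prompt_main.draw())
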
def create_acceptor():
    return fst.Acceptor()