Esempio n. 1
0
def test_replace():
    """Check that ``replace`` expands nested ``$``-labelled arcs.

    The root acceptor contains a ``$name`` arc; ``$name`` itself contains
    ``$firstname`` and ``$lastname`` arcs.  After replacement (with
    ``epsilon=True``) and epsilon removal, the result must equal a
    hand-built acceptor that went through the same epsilon removal.
    """
    syms = fst.SymbolTable()

    def build(arcs, final_state):
        """Return an acceptor over ``syms`` with the given arcs and final state."""
        acceptor = fst.Acceptor(syms)
        for src, dst, label in arcs:
            acceptor.add_arc(src, dst, label)
        acceptor[final_state].final = True
        return acceptor

    root = build(
        [
            (0, 1, 'dial'),
            (1, 2, 'google'),
            (1, 2, '$name'),
            (2, 3, 'please'),
        ],
        3,
    )
    name = build(
        [
            (0, 1, 'michael'),
            (1, 2, 'riley'),
            (0, 1, '$firstname'),
            (1, 2, '$lastname'),
        ],
        2,
    )
    firstname = build([(0, 1, 'johan')], 1)
    lastname = build([(0, 1, 'schalkwyk')], 1)

    replacements = {
        '$name': name,
        '$firstname': firstname,
        '$lastname': lastname,
    }
    result = root.replace(replacements, epsilon=True)
    result.remove_epsilon()

    # Hand-written expansion: every replaced arc becomes an epsilon arc into
    # the sub-acceptor and an epsilon arc back out of it.
    expected = build(
        [
            (0, 1, 'dial'),
            (1, 2, 'google'),
            (1, 3, fst.EPSILON),
            (3, 5, 'michael'),
            (3, 6, fst.EPSILON),
            (6, 9, 'johan'),
            (9, 5, fst.EPSILON),
            (5, 7, 'riley'),
            (5, 8, fst.EPSILON),
            (8, 10, 'schalkwyk'),
            (10, 7, fst.EPSILON),
            (7, 2, fst.EPSILON),
            (2, 4, 'please'),
        ],
        4,
    )
    expected.remove_epsilon()

    eq_(result, expected)
Esempio n. 2
0
def sigma(syms):
    """Build a two-state acceptor that accepts any single symbol of ``syms``.

    One arc from state 0 to state 1 is added for every symbol whose id in
    the table is greater than zero; state 1 is the only final state.
    """
    acceptor = fst.Acceptor(syms=syms)
    for symbol, symbol_id in syms.items():
        if symbol_id <= 0:
            # Ids that are not strictly positive get no arc.
            continue
        acceptor.add_arc(0, 1, symbol)
    acceptor[1].final = True
    return acceptor
Esempio n. 3
0
def test_shortest_distance():
    """Check ``shortest_distance`` both without arguments and with ``True``.

    The expected values with ``True`` correspond to distances towards the
    final state (state 3, whose final weight is 3).
    """
    weighted_arcs = (
        (0, 1, 'a', 3),
        (1, 1, 'b', 2),
        (1, 3, 'c', 4),
        (0, 2, 'd', 5),
        (2, 3, 'f', 4),
    )
    acceptor = fst.Acceptor()
    for arc in weighted_arcs:
        acceptor.add_arc(*arc)
    acceptor[3].final = 3

    from_start = [float(dist) for dist in acceptor.shortest_distance()]
    eq_(from_start, [0, 3, 5, 7])

    with_flag = [float(dist) for dist in acceptor.shortest_distance(True)]
    eq_(with_flag, [10, 7, 7, 3])
Esempio n. 4
0
def test_paths():
    """Check that ``paths`` enumerates every accepted label sequence."""
    acceptor = fst.Acceptor()
    for src, dst, label in [
        (0, 1, 'a'),
        (1, 2, 'b'),
        (0, 2, 'c'),
        (2, 3, 'd'),
        (3, 4, 'e'),
    ]:
        acceptor.add_arc(src, dst, label)
    for final_state in (2, 4):
        acceptor[final_state].final = True

    # Spell out each path by joining the input symbols of its arcs.
    found = set()
    for path in acceptor.paths():
        labels = (acceptor.isyms.find(arc.ilabel) for arc in path)
        found.add(''.join(labels))

    eq_(found, {'ab', 'c', 'abde', 'cde'})
Esempio n. 5
0
def calculate_unigram_constraint(counts, syms=None):
    """Build a one-state unigram acceptor from a word -> count mapping.

    Every word gets a self-loop on state 0 weighted by the negative log of
    its relative frequency (floored at 1e-17 so the log stays finite).
    State 0 is final.

    Args:
        counts: mapping from word to its count.
        syms: optional symbol table handed to ``fst.Acceptor``.

    Returns:
        The constructed acceptor.
    """
    min_prob = 1e-17
    unigram = fst.Acceptor(syms=syms)
    denominator = sum(counts[word] for word in counts) + 0.0
    for word in counts:
        relative_freq = (counts[word] + 0.0) / denominator
        weight = -math.log(max(relative_freq, min_prob))
        unigram.add_arc(0, 0, word, weight)
    unigram[0].final = True
    return unigram
Esempio n. 6
0
def test_simple():
    """Build a linear transducer and a linear acceptor and check their size."""
    word = 'hello'
    mirrored = word[::-1]
    end_state = len(word)

    # Transducer mapping each character of the word to the mirrored one.
    transducer = fst.Transducer()
    for pos in range(end_state):
        transducer.add_arc(pos, pos + 1, word[pos], mirrored[pos])
    transducer[end_state].final = True
    eq_(len(transducer), 6)
    ok_(transducer[5].final)

    # Acceptor spelling out the word one character per arc.
    acceptor = fst.Acceptor()
    for pos in range(end_state):
        acceptor.add_arc(pos, pos + 1, word[pos])
    acceptor[end_state].final = True
    eq_(len(acceptor), 6)
    ok_(acceptor[5].final)
Esempio n. 7
0
def test_randgen():
    """Check that randomly generated paths belong to the source acceptor."""
    arcs = [
        (0, 1, 'a', 0.5),
        (1, 2, 'b', 0.5),
        (0, 2, 'ab', 1.0),
        (2, 3, 'c'),
        (3, 4, 'd'),
        (2, 4, 'cd'),
    ]
    acceptor = fst.Acceptor()
    for arc in arcs:
        acceptor.add_arc(*arc)
    acceptor[4].final = True

    for generator_name in ('uniform_generate', 'logprob_generate'):
        sample = getattr(acceptor, generator_name)()
        # The sample is in the acceptor iff combining it (`&`) with the
        # unweighted acceptor leaves it unchanged.
        eq_(sample & acceptor.remove_weights(), sample)
Esempio n. 8
0
def bfs(stateid, allLM, sym):
    """Extract the branch rooted at ``stateid`` as a fresh acceptor.

    The branch is traversed breadth-first so that its paths can be
    extracted easily later.  States of the branch are renumbered from zero
    in visiting order, regardless of their ids in the original tree.

    Args:
        stateid: id of the state in ``allLM`` where the branch begins.
        allLM: the big tree FST, indexable by state id.
        sym: symbol table of ``allLM``.

    Returns:
        An acceptor over ``sym`` containing the branch.  The state that
        received the highest new id is marked final.

    Note:
        The root is not recorded in the renumbering table and every arc
        target is enqueued, so the input is assumed to be a tree (no
        cycles, no state reachable through two different arcs).
    """
    from collections import deque

    branch = fst.Acceptor(syms=sym)
    queue = deque([stateid])  # FIFO of original state ids still to expand
    st_encod = {}             # original state id -> id in the branch
    next_id = 0
    while queue:
        state = allLM[queue.popleft()]
        if next_id == 0:
            # The very first state expanded is the root; it becomes state 0.
            sid = 0
            next_id = 1
        else:
            sid = st_encod[state.stateid]

        for arc in state.arcs:
            nextst = arc.nextstate

            # Assign new ids in order of first discovery.
            if nextst not in st_encod:
                st_encod[nextst] = next_id
                next_id += 1

            queue.append(nextst)
            label = sym.find(arc.ilabel)
            branch.add_arc(sid, st_encod[nextst], label, arc.weight)

    branch[next_id - 1].final = True

    return branch
Esempio n. 9
0
    def fst_alter_sent(self, words, numalts=5, cutoff=0):
        """Generate alternative wordings of a sentence and score them.

        A lattice acceptor is built over the language model's input symbols:
        one arc per original word (``"<unk>"`` when the word is not in the
        symbol table) plus arcs for near neighbours returned by
        ``self.vecs.near``.  The lattice is composed with ``self.lmfst``,
        the ``numalts`` shortest paths are read back as strings, and the
        result is passed through ``self.sent_rescore`` and sorted.

        Args:
            words: sequence of word strings making up the sentence.
            numalts: number of shortest paths requested, and the maximum
                number of results returned.
            cutoff: when greater than zero, only results whose score is
                less than or equal to this value are kept.

        Returns:
            The rescored ``(score, sentence)`` entries in ascending order
            (assumes ``sent_rescore`` returns a sortable list -- TODO confirm).
        """
        # With NLTK we could do POS tagging here (nltk.pos_tag);
        # instead, every word is simply tagged as a noun.
        pos = [(w, 'NN') for w in words]

        # Only these tag families are eligible for alternatives.
        content_tags = ('NN', 'JJ', 'RB', 'VB')
        # Auxiliaries, punctuation and 'of' never get alternatives.
        excluded_words = ('have', 'has', 'had', 'is', 'are', 'am',
                          'was', 'were', 'be', '.', ',', ':', '?',
                          '!', '-', '--', 'of')

        altfst = fst.Acceptor(syms=self.lmfst.isyms)

        for idx, (word, tag) in enumerate(pos):
            # add the word to the lattice
            if word in altfst.isyms:
                altfst.add_arc(idx, idx + 1, word, 0)
            else:
                altfst.add_arc(idx, idx + 1, "<unk>", 0)

            # add word alternatives to the lattice
            if not tag.startswith(content_tags):
                continue
            if word in excluded_words or word.startswith("'"):
                continue

            nearlist = self.vecs.near(word, 5)

            # check if there are any neighbors at all
            if nearlist is None:
                continue

            # add each neighbor to the lattice
            for dist, w in nearlist:
                if dist > 0.1 and w in altfst.isyms and w != word:
                    altfst.add_arc(idx, idx + 1, w,
                                   (math.log(dist) * -1) / 1000)

        # mark the final state in the FST
        altfst[len(words)].final = True

        # rescore the lattice using the language model
        scoredfst = self.lmfst.compose(altfst)

        # get best paths in the rescored lattice
        bestpaths = scoredfst.shortest_path(numalts)
        bestpaths.remove_epsilon()

        altstrings = {}

        # get the strings and weights from the best paths; when the same
        # string shows up more than once, the first weight seen is kept
        for path in bestpaths.paths():
            path_string = ' '.join(
                bestpaths.isyms.find(arc.ilabel) for arc in path)
            path_weight = functools.reduce(operator.mul,
                                           (arc.weight for arc in path))
            if path_string not in altstrings:
                altstrings[path_string] = path_weight

        # The numeric score is parsed out of the weight's string form,
        # i.e. the text between the parentheses.
        scoredstrings = []
        for sent in altstrings:
            score = float(("%s" % altstrings[sent]).split('(')[1].strip(')'))
            scoredstrings.append((score, sent))

        # sort strings by weight
        scoredstrings = self.sent_rescore(scoredstrings)
        scoredstrings.sort()

        if len(scoredstrings) > numalts:
            # Fixed: the original sliced the undefined name `scoredstring`,
            # raising NameError whenever there were more than numalts results.
            scoredstrings = scoredstrings[:numalts]

        if cutoff > 0:
            scoredstrings = [s for s in scoredstrings if s[0] <= cutoff]

        return scoredstrings
Esempio n. 10
0
                           -math.log(A_full_table[i][j]))

# For every tag, add an eps:eps arc from state (num_temp + i) to state
# (2 * num_temp + 1), weighted by -log(A_full_table[i][num_temp + 1]).
# NOTE(review): presumably column num_temp + 1 holds the tag -> end
# transition probability; confirm against how A_full_table is built.
for tag in full_tag_set:
    i = dict_tags[tag]
    HMM_tagger.add_arc(num_temp + i, 2 * num_temp + 1, eps, eps,
                       -math.log(A_full_table[i][num_temp + 1]))

# State (2 * num_temp + 1) is the final state of the tagger.
HMM_tagger[num_temp * 2 + 1].final = True

###########################################################
# Test part.
# Run the part above once; this part can then be re-run many times,
# changing `index` each time to pick a different sentence of test_set.
# It collects the reference tags of the sentence and decodes the
# tagger's answer for it.
###########################################################

# Linear acceptor over the tagger's input symbols, one arc per word.
test = fst.Acceptor(HMM_tagger.isyms)
# num_temp is reused here as the running state counter of the acceptor.
num_temp = 0

# index of testing sentence
index = 0

# Reference tags of the chosen sentence, in word order.
right_ans = []

for (word, tag) in test_set[index]:
    test.add_arc(num_temp, num_temp + 1, word)
    num_temp = num_temp + 1
    right_ans.append(tag)
# After the loop num_temp equals the number of words, i.e. the last state.
test[num_temp].final = True

# Combine the sentence with the tagger (`>>`, presumably composition -- confirm
# against the fst library) and keep the shortest path, then project onto the
# output labels.
test = ((test >> HMM_tagger).shortest_path())
test.project_output()
Esempio n. 11
0
# Collect the lower-cased text of every message that is not from the customer.
rep_text = []
file_directory = "../Data_Engineer_ASAPP_Challenge/sample_conversations.json"
# Read through a context manager so the file handle is always closed.
with open(file_directory) as json_file:
    json_data = json_file.read()
data = json.loads(json_data)
for msg in data["Issues"]:
    for cstmr in msg["Messages"]:
        if cstmr["IsFromCustomer"] is False:
            rep_text.append(cstmr["Text"].lower())

rep_text = sorted(rep_text)
# extract words for syms

# build prefix tree
#rep_text = ["I'm", "I'm at", "I want", "what are","what he", "where?"]
syms = fst.SymbolTable()
prompt_main = fst.Acceptor(syms)

for pmt in rep_text:
    if not pmt:
        # An empty message has no characters, so the loop below would add
        # no arc and leave the index of the final state unbound (or stale
        # from the previous message); skip it.
        continue

    # Linear acceptor spelling the message one character per arc (weight 1).
    prompt = fst.Acceptor(syms)
    for i, ch in enumerate(pmt):
        prompt.add_arc(i, i + 1, ch, 1)
    prompt[len(pmt)].final = True

    # Merge the message into the main acceptor and re-normalise it after
    # every insertion.
    prompt_main.set_union(prompt)
    prompt_main.remove_epsilon()
    prompt_main = prompt_main.determinize()
    prompt_main.minimize()
    prompt_main = prompt_main.push_weights(final=False)

# write to dot and draw
names = ["prfx_tree"]
Esempio n. 12
0
 def create_acceptor():
     """Return a new acceptor created with the default arguments."""
     acceptor = fst.Acceptor()
     return acceptor