-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze.py
116 lines (94 loc) · 4.39 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import FSM
import util
vocabulary = ['panic', 'picnic', 'ace', 'pack', 'pace', 'traffic', 'lilac', 'ice', 'spruce', 'frolic']
suffixes = ['', '+ed', '+ing', '+s']
def buildSourceModel(vocabulary, suffixes):
# we want a language model that accepts anything of the form
# * w
# * w+s
fsa = FSM.FSM()
fsa.setInitialState('start')
fsa.setFinalState('end')
for word in vocabulary:
fsa.addEdgeSequence('start', '1', word)
fsa.addEdge('1', 'end', None) # if there is no suffix
for suffix in suffixes:
if len(suffix) > 0: # skip if no suffix
fsa.addEdgeSequence('1', 'end', suffix)
return fsa
def buildChannelModel():
# this should have exactly the same rules as englishMorph.py!
fst = FSM.FSM(isTransducer=True)
fst.setInitialState('start')
fst.setFinalState('end')
# we can always get from start to end by consuming non-+
# characters... to implement this, we transition to a safe state,
# then consume a bunch of stuff
fst.addEdge('start', 'safe', '.', '.')
fst.addEdge('safe', 'safe', '.', '.')
fst.addEdge('safe', 'safe2', '+', None)
fst.addEdge('safe2', 'safe2', '.', '.')
fst.addEdge('safe', 'end', None, None)
fst.addEdge('safe2', 'end', None, None)
# implementation of rule 1
fst.addEdge('start' , 'rule1' , None, None) # epsilon transition
fst.addEdge('rule1' , 'rule1' , '.', '.') # accept any character and copy it
fst.addEdge('rule1' , 'rule1b', 'e', None) # remove the e
fst.addEdge('rule1b', 'rule1c', '+', None) # remove the +
fst.addEdge('rule1c', 'rule1d', 'e', 'e') # copy an e ...
fst.addEdge('rule1c', 'rule1d', 'i', 'i') # ... or an i
fst.addEdge('rule1d', 'rule1d', '.', '.') # and then copy the remainder
fst.addEdge('rule1d', 'end' , None, None) # we're done
# implementation of rule 2
fst.addEdge('start' , 'rule2' , '.', '.') # we need to actually consume something
fst.addEdge('rule2' , 'rule2' , '.', '.') # accept any character and copy it
fst.addEdge('rule2' , 'rule2b', 'e', 'e') # keep the e
fst.addEdge('rule2b', 'rule2c', '+', None) # remove the +
for i in range(ord('a'), ord('z')):
c = chr(i)
if c == 'e' or c == 'i':
continue
fst.addEdge('rule2c', 'rule2d', c, c) # keep anything except e or i
fst.addEdge('rule2d', 'rule2d', '.', '.') # keep the rest
fst.addEdge('rule2d', 'end' , None, None) # we're done
# implementation of rule 3
fst.addEdge('start' , 'rule3' , '.', '.') # not sure if eps or not
fst.addEdge('rule3' , 'rule3' , '.', '.')
for c in ['a','e','i','o','u']:
fst.addEdge('rule3', 'rule3b', c, c)
fst.addEdge('rule3b', 'rule3c', 'c', 'c')
fst.addEdge('rule3c', 'rule3d', '+', 'k')
fst.addEdge('rule3d', 'rule3e', 'e', 'e')
fst.addEdge('rule3d', 'rule3e', 'i', 'i')
fst.addEdge('rule3e', 'rule3e', '.', '.')
fst.addEdge('rule3e', 'end' , None, None)
return fst
def simpleTest():
fsa = buildSourceModel(vocabulary, suffixes)
fst = buildChannelModel()
print "==== Trying source model on strings 'ace+ed' ===="
#was [fsa]
output = FSM.runFST([fsa], ["ace+ed"])
print "==== Result: ", str(output), " ===="
print "==== Trying source model on strings 'panic+ing' ===="
#was [fsa]
output = FSM.runFST([fsa], ["panic+ing"])
print "==== Result: ", str(output), " ===="
#for w in vocabulary:
#for s in suffixes:
#analysis = w + s
#output = FSM.runFST([fsa],[analysis])[0][0]
#if output != analysis:
#print "FAILED"
#sys.exit(1)
print "==== Generating random paths for 'aced', using only channel model ===="
output = FSM.runFST([fst], ["aced"], maxNumPaths=10, randomPaths=True)
print "==== Result: ", str(output), " ===="
print "==== Generating random paths for 'aced', using only channel model ===="
output = FSM.runFST([fst], ["panicked"], maxNumPaths=10, randomPaths=True)
print "==== Result: ", str(output), " ===="
print "==== Disambiguating a few phrases: aced, panicked, paniced, sprucing ===="
output = FSM.runFST([fsa,fst], ["aced", "paniced", "panicked", "sprucing"])
print "==== Result: ", str(output), " ===="
if __name__ == '__main__':
simpleTest()