-
Notifications
You must be signed in to change notification settings - Fork 0
/
bottom_up_chart_parser.py
executable file
·339 lines (292 loc) · 13.5 KB
/
bottom_up_chart_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env python
from chart import Chart
from queue import Queue, BestFirstQueue, AltSearchQueue
from edge import Edge
from production_rule import ProductionRule
from grammar import Grammar
from parse_exception import ParseException
from queue_exception import QueueException
from bisect import bisect
class BottomUpChartParser:
    '''
    Bottom-up chart parser.

    Edges are taken from a queue one at a time, stored in a chart, and
    combined via the predict rule and the fundamental rule until the
    queue is exhausted or enough complete S edges have been found.
    The order in which edges leave the queue depends on the strategy
    passed to parse() ('fifo', 'bestfirst' or 'altsearch').
    '''

    grammar = None          # Grammar object that includes lexicon and
                            # production rules
    queue = None            # Queue object on which new edges are stacked
    chart = None            # Chart object in which edges are stored for the
                            # final parse generation
    sentence_length = 0     # Number of tokens of the sentence being parsed
    will_print_chart = True # Set to False if you want to deactivate
                            # printing of the found parses

    def __init__(self, grammar):
        '''
        Wrap the given grammar description in a Grammar object.
        The argument is passed to the Grammar constructor unchanged.
        '''
        self.grammar = Grammar(grammar)

    def parse(self, sentence, number_of_parses, strategy):
        '''
        Parse the input sentence

        This is the central method to be called from outside.

        sentence:         string that is split into tokens at whitespace
        number_of_parses: stop as soon as this many complete S edges
                          are in the chart; -1 means "never stop early"
        strategy:         'fifo', 'bestfirst' or 'altsearch'

        Raises ParseException if the sentence contains words that are
        not in the lexicon, or (when will_print_chart is set) if no
        parse could be found.
        Raises QueueException if the strategy is not one of the three
        known ones.
        '''
        ### Preprocessing ###
        # Tokenize input sentence
        tokens = self.tokenize(sentence)
        self.sentence_length = len(tokens)

        # Check for unknown tokens
        unknown_words = self.get_unknown_words(tokens)
        if unknown_words:
            # TODO: Run fallback solutions to fix unknown words, else
            # raise exception
            raise ParseException("Sentence contains unknown words (%s). Please try again!" % ', '.join(unknown_words))

        ### Main steps ###
        # (1) Initialize empty chart and queue
        self.initialize_chart()
        self.initialize_queue(strategy)

        # (2) For every token, create a complete edge and push it to
        #     the queue
        self.init_rule(tokens)

        # Iteration counter for evaluation purposes
        iters = 0

        # (3) Repeat until no more edges are added
        #     or sufficient number of parses has been found:
        while not self.queue.is_empty() and not self.enough_parses_found(number_of_parses):
            iters += 1

            # (3.1) Add next element on queue to the chart
            edge = self.queue.get_next_edge()
            self.chart.add_edge(edge)

            # (3.2) If input edge is complete,
            #       apply predict rule and fundamental rule.
            #       If input edge is incomplete,
            #       apply fundamental rule only
            if edge.is_complete():
                self.predict_rule(edge)
            self.fundamental_rule(edge)

            # (3.3) For alt search strategy, run search rule
            #       if input edge is a complete parse
            #       or last element of priority queue
            if strategy == 'altsearch':
                # Case 1: Complete parse was added to chart
                complete_parse_added = (
                    (not self.queue.is_priority_active())
                    and edge.get_prod_rule().get_lhs() == 'S'
                    and edge.is_complete()
                    and edge.get_start() == 0
                    and edge.get_end() == self.sentence_length)
                # Case 2: Priority queue emptied
                if complete_parse_added or (self.queue.is_priority_active()
                                            and self.queue.is_priority_empty()):
                    self.search_rule(edge)

        # (4) Display generated parses
        # print(...) with one pre-formatted string behaves the same under
        # Python 2 (print statement) and Python 3 (print function).
        s_edges = self.chart.get_s_edges()
        print('%s parses found after %s iterations:' % (len(s_edges), iters))
        if self.will_print_chart:
            self.display_parses()
        else:
            for s_edge in s_edges:
                print('Found s-edge: %s' % s_edge)

    def tokenize(self, sentence):
        '''
        Separate a sentence into a list of tokens and return the list.
        Currently this simply splits at each whitespace character with no
        special preprocessing
        '''
        return sentence.split()

    def get_unknown_words(self, tokens):
        '''
        Check list of tokens for unknown words by consulting the
        lexicon and return them (in sentence order, duplicates kept)
        '''
        lexicon = self.grammar.get_lexicon()
        return [token for token in tokens if token not in lexicon]

    def initialize_chart(self):
        '''
        Initialize chart
        The chart is constructed from sentence_length; according to the
        original notes its size is sentence_length+1 in both dimensions.
        '''
        self.chart = Chart(self.sentence_length)

    def initialize_queue(self, strategy):
        '''
        Initialize queue according to the parsing strategy chosen by
        the user

        Raises QueueException for an unknown strategy name.
        '''
        if strategy == 'fifo':
            self.queue = Queue()
        elif strategy == 'bestfirst':
            self.queue = BestFirstQueue()
        elif strategy == 'altsearch':
            self.queue = AltSearchQueue(self.sentence_length + 1)
        else:
            raise QueueException('Invalid strategy (%s). Please try again and choose a strategy from the following set: {fifo, bestfirst, altsearch}' % strategy)

    def init_rule(self, tokens):
        '''
        Generate initial edges for all given tokens and add them to
        the queue

        Formal definition:
            For every word w_i add the edge [w_i -> . , (i, i+1)]
        '''
        # node is the position between tokens of the sentence
        # (0 is start of sentence)
        for node, token in enumerate(tokens):
            rule = ProductionRule(token, [], 1.0)
            self.queue.add_edge(Edge(node, node + 1, rule, 0, []))

    def enough_parses_found(self, number_of_parses):
        '''
        Check if enough parses have been found for the input sentence

        Return True if the number of complete S edges that the chart
        contains is >= the number of parses that the user wants, else
        False. A request of -1 parses is never satisfied, i.e. parsing
        continues until the queue is empty.
        '''
        return (number_of_parses != -1
                and len(self.chart.get_s_edges()) >= number_of_parses)

    def predict_rule(self, complete_edge):
        '''
        If the LHS of a complete edge can be the first RHS element of
        a production rule, create a self-loop edge with that rule and
        push it to the queue

        Input:         Complete edge
        Push to queue: Incomplete self-loop edges

        Formal definition:
            For each complete edge [A -> alpha . , (i, j)]
            and each production rule  B -> A beta,
            add the self-loop edge [B -> . A beta , (i, i)]
        '''
        start = complete_edge.get_start()
        lhs = complete_edge.get_prod_rule().get_lhs()
        for parent_rule in self.grammar.get_possible_parent_rules(lhs):
            new_edge = Edge(start, start, parent_rule, 0, [])
            # Only push edges that are neither queued nor charted yet
            if not self.queue.has_edge(new_edge) and not self.chart.has_edge(new_edge):
                self.queue.add_edge(new_edge)

    def fundamental_rule(self, input_edge):
        '''
        If an incomplete edge can be advanced by a complete edge,
        create a new edge with the advanced dot.

        Create new edges (which can be complete or incomplete) by
        "advancing the dot", i.e. by matching incomplete edges with
        appropriate complete ones:

        (1) If the input edge is incomplete, find all complete edges
            - whose start node equals the end node of the input edge
            - whose LHS matches the RHS element
              that the input edge is currently looking for.
            If the input edge is complete, find all incomplete edges
            - whose end node equals the start node of the input edge
            - whose dot can be advanced by pairing them with the input
              edge.
        (2) From every pairing, create a new edge with the dot
            advanced over the RHS element that has just been found.
        (3) Push that edge to the queue IFF it does not exist already,
            i.e. if it has not been added to the chart or the queue
            before. This constraint keeps the parser from entering an
            infinite loop when using left-recursive grammar rules.

        Input:         Single edge
        Push to queue: Complete and incomplete edges

        Formal definition:
            If the chart contains the edges [A -> alpha . B beta, (i, j)]
                                        and [B -> gamma . , (j, k)]
            then add a new edge [A -> alpha B . beta, (i, k)].
        '''
        if input_edge.is_complete():
            j = input_edge.get_start()
            incomplete_edges = [edge for edge
                                in self.chart.get_edges_ending_at(j)
                                if not edge.is_complete()]
            complete_edges = [input_edge]
        else:
            j = input_edge.get_end()
            incomplete_edges = [input_edge]
            complete_edges = [edge for edge
                              in self.chart.get_edges_starting_at(j)
                              if edge.is_complete()]

        ### New Edges ###
        for incomp_edge in incomplete_edges:
            # Prepare info from incomplete edge that is necessary to ...
            prod_rule = incomp_edge.get_prod_rule()
            dot = incomp_edge.get_dot()
            next_missing_dtr = prod_rule.get_rhs_element(dot)

            for comp_edge in complete_edges:
                # ... check for compatibility with complete edges:
                if next_missing_dtr != comp_edge.get_prod_rule().get_lhs():
                    continue

                # Combine info from both edges,
                # and use it to create new edge
                i = incomp_edge.get_start()
                k = comp_edge.get_end()
                new_dtrs = incomp_edge.get_known_dtrs() + [comp_edge]
                new_edge = Edge(i, k, prod_rule, dot + 1, new_dtrs)

                # Add new edge to queue
                if not self.queue.has_edge(new_edge) and not self.chart.has_edge(new_edge):
                    self.queue.add_edge(new_edge)

    def search_rule(self, s_edge):
        '''
        Scans queue for priority edges. See project report for detailed
        explanation

        Steps, as far as they can be read off this method:
        (1) Activate the priority queue.
        (2) If another complete S edge spanning the whole sentence is
            waiting on the queue, re-add it via add_edge and stop.
        (3) Otherwise walk through the S edges already in the chart,
            most probable first. For each one, descend through its
            daughters level by level; on every level, pull all queued
            edges that have the same LHS and span as one of the
            daughters and re-add them via add_edge. Stop at the first
            level that yielded at least one such alternative.

        The s_edge argument is not used; it is kept so that existing
        callers keep working.

        NOTE(review): the previous version tested the bound method
        self.queue.is_empty (without calling it) to decide whether
        alternatives had been found. That is always true, so it never
        stopped early. "Found" is now tracked locally, which does not
        depend on how the queue defines emptiness.
        NOTE(review): presumably add_edge routes edges to the priority
        queue once it is active - confirm against AltSearchQueue.
        '''
        self.queue.activate_priority_queue()

        # Check for further complete s-edges on queue
        further_parse = self.queue.get_next_particular_edge('S', 0, self.sentence_length)
        if further_parse is not None:
            self.queue.add_edge(further_parse)
            return

        # No further s-edge remained on queue.
        # Sort s-edges by likelihood (ascending), so that pop() hands
        # out the most probable parse first. sorted() returns a new
        # list, so the chart's own list is never modified.
        ranked = sorted(self.chart.get_s_edges(),
                        key=lambda edge: edge.get_prob())

        # Check s-edges until alternative parses are found
        found_alternative = False
        while ranked and not found_alternative:
            level = ranked.pop().get_known_dtrs()

            # Iterate through depths until alternatives are found
            # or tree is exhausted
            while level and not found_alternative:
                for dtr in level:
                    if self._promote_alternatives(dtr):
                        found_alternative = True
                if not found_alternative:
                    # If no alt edge was found on this level,
                    # check all elements of next lower level.
                    level = [grand_dtr
                             for dtr in level
                             for grand_dtr in dtr.get_known_dtrs()]

    def _promote_alternatives(self, edge):
        '''
        Pull every queued edge with the same LHS and span as the given
        edge off the queue and re-add it via add_edge.
        Return True if at least one such edge was found, else False.
        '''
        lhs = edge.get_prod_rule().get_lhs()
        start = edge.get_start()
        end = edge.get_end()

        promoted = False
        alternative = self.queue.get_next_particular_edge(lhs, start, end)
        while alternative is not None:
            self.queue.add_edge(alternative)
            promoted = True
            alternative = self.queue.get_next_particular_edge(lhs, start, end)
        return promoted

    def display_parses(self):
        '''
        Display parse trees for all successful parses

        Every tree is printed in indented bracket notation, followed
        by a tab and the probability of its S edge.
        Raises ParseException if the chart contains no S edge.
        '''
        s_edges = self.chart.get_s_edges()
        if not s_edges:
            raise ParseException("No parse could be found.")
        for s_edge in s_edges:
            parse_string = self.build_parse_string_from_edge(s_edge, 'S')
            print(self.add_indentation_to_parse_string(parse_string) + '\t' + str(s_edge.get_prob()))

    def build_parse_string_from_edge(self, edge, root):
        '''
        Recursively work your way down through the known daughters of
        the input edge; return a bracketed structure representing the
        parse tree.

        In order to obtain a complete structure, this
        method needs to be called with a string representing
        the appropriate tree root (as the second argument)
        '''
        # An edge without daughters contributes nothing but the root
        pieces = [root]
        for dtr in edge.get_known_dtrs():
            pieces.append(' [ ' + dtr.get_prod_rule().get_lhs()
                          + self.build_parse_string_from_edge(dtr, '')
                          + ' ]')
        return ''.join(pieces)

    def add_indentation_to_parse_string(self, parse_string):
        '''
        Convert flat string representation of parse to appropriately
        indented structure

        The string is wrapped in one more pair of brackets; every
        opening bracket is then moved to a new line and indented with
        one tab per nesting level. Closing brackets stay in place.
        '''
        pieces = []
        level = -1
        for char in '[ ' + parse_string + ' ]':
            if char == '[':
                level += 1
                pieces.append('\n' + '\t' * level)
            elif char == ']':
                level -= 1
            pieces.append(char)
        return ''.join(pieces)