/
chunker.py
222 lines (177 loc) · 6.59 KB
/
chunker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class Rule:
    """A flat chunking rule that groups sibling nodes under a new subtree.

    ``contents`` is a whitespace-separated sequence of node labels / POS tags
    (for example ``"NP CC NP"``); ``type`` is the label given to the subtree
    that replaces a run of siblings spelling out that sequence.

    Subtree labels are read through ``label()`` / ``set_label()`` when the
    node provides them (NLTK 3) and through the legacy ``.node`` attribute
    otherwise (NLTK 2), so the class works with either API.
    """

    def __init__(self, contents, type):
        self.contents = contents
        self.type = type

    ###################################################
    @staticmethod
    def _is_subtree(item):
        """Return True when *item* exposes a tree label (new or legacy API)."""
        # ``label`` is tested first so the legacy attribute is never touched
        # on objects that already provide the accessor method.
        return callable(getattr(item, "label", None)) or hasattr(item, "node")

    @staticmethod
    def _label_of(item):
        """Return the token a node contributes when a rule is matched.

        * tuple leaf       -> its last element (the POS tag)
        * labelled subtree -> its label
        * anything else    -> the item itself (a leaf with no extra info)
        """
        if isinstance(item, tuple):
            return item[-1]
        getter = getattr(item, "label", None)
        if callable(getter):
            return getter()
        if hasattr(item, "node"):
            return item.node
        return item

    @staticmethod
    def _set_label(subtree, value):
        """Assign *value* as the label of *subtree* under either API."""
        setter = getattr(subtree, "set_label", None)
        if callable(setter):
            setter(value)
        else:
            subtree.node = value

    ###################################################
    def rule_to_children(self, tree, children):
        """Find the first run of siblings whose labels spell out this rule.

        tree     -- object indexable by tree position (``tree[position]``)
        children -- positions of sibling nodes, in left-to-right order

        Returns ``(start, end)`` as indices into *children* (``end``
        exclusive) for the first match, or ``(-1, -1)`` when the rule does
        not apply.

        Rule tokens are compared with whole labels, literally.  Matching the
        rule as a regular expression against the joined label string let a
        rule start in the middle of a token ("NP CC NP" inside "NNP CC NP"),
        which produced wrong and even out-of-range spans, and it gave tags
        such as "." or "PRP$" regex meaning.
        """
        rule_tokens = self.contents.split()
        width = len(rule_tokens)
        if width == 0:
            # an empty rule can never be instantiated
            return (-1, -1)
        labels = [self._label_of(tree[position]) for position in children]
        for start in range(len(labels) - width + 1):
            if labels[start:start + width] == rule_tokens:
                return (start, start + width)
        return (-1, -1)

    ####################################################
    def find_brothers(self, children, parent):
        """Group tree positions by parent position.

        children -- list of tree positions (tuples); the root position ``()``
                    is removed from this list IN PLACE because the root has
                    neither a parent nor brothers
        parent   -- dict filled IN PLACE so that ``parent[child]`` is the
                    position of child's parent (the position minus its last
                    element)

        Returns a dict mapping a parent position to the list of its children
        ("brothers"), in the order they appear in *children*.  Parents with
        a single child are left out, since no grouping is possible there.
        """
        if () in children:
            children.remove(())
        already_grouped = set()
        grouped = {}
        for child in children:
            parent[child] = child[0:-1]
            if child in already_grouped:
                # a repeated position must not be listed twice
                continue
            already_grouped.add(child)
            grouped.setdefault(parent[child], []).append(child)
        return {
            parent_position: brothers
            for parent_position, brothers in grouped.items()
            if len(brothers) > 1
        }

    ##########################################################
    def chunk(self, tree, rule, depth):
        """Apply *rule* to *tree* in place, at most *depth* times.

        tree  -- an NLTK-style tree (needs ``treepositions('postorder')``,
                 ``subtrees()`` and indexing/assignment by tree position)
        rule  -- the Rule to apply; its ``contents`` are matched and its
                 ``type`` labels the new subtree
        depth -- remaining number of applications; 0 stops immediately

        The first run of brothers matching the rule is moved under a new
        subtree.  Each moved node has its label (or, for a leaf, its tag)
        wrapped in ``<...>`` so the derivation stays visible and the same
        nodes are not matched again.  The method then recurses while other
        brothers may still match.  Returns the (mutated) tree.
        """
        if depth == 0:  # maximum recursion set by depth
            return tree
        rule_width = len(rule.contents.split())
        positions = tree.treepositions('postorder')
        parent = {}
        # find_brothers fills ``parent`` and strips the root from ``positions``
        brothers_by_parent = rule.find_brothers(positions, parent)
        if not brothers_by_parent:  # no possible application of rule
            return tree
        for position in positions:
            brothers = brothers_by_parent.get(parent[position])
            if brothers is None:
                continue
            # use ``rule`` here as well, so matching and labelling always
            # refer to the same rule object
            start, end = rule.rule_to_children(tree, brothers)
            if (start, end) == (-1, -1):
                continue
            matched = brothers[start:end]
            # Tree(label, children) is accepted by both old and new NLTK;
            # Tree resolves to the module-level nltk import
            new_tree = Tree(rule.type, [])
            for member in matched:
                node = tree[member]
                if self._is_subtree(node):
                    self._set_label(node, "<" + self._label_of(node) + ">")
                else:
                    # leaf: keep the first element (the word, for (word, tag)
                    # leaves) and mark the last element (the tag)
                    tree[member] = (node[0], "<" + str(node[-1]) + ">")
                new_tree.append(tree[member])
            # attach new tree at the left-most matched child, then flag the
            # remaining matched children for removal
            tree[matched[0]] = new_tree
            for member in matched[1:]:
                tree[member] = "REMOVE"
            # subtrees() also yields the root; loop until every placeholder
            # is gone, since a rule of 3+ tokens leaves several in one parent
            for subtree in tree.subtrees():
                while "REMOVE" in subtree:
                    subtree.remove("REMOVE")
            # recurse if there are more brothers to whom rule may apply
            if len(brothers_by_parent) > 1 or len(brothers) > rule_width:
                return self.chunk(tree, rule, depth - 1)
            return tree
        # found no children for whom rule applies, so just return tree
        return tree
########END OF RULE CLASS#################
###########################################
from nltk.tree import Tree
from nltk.corpus import treebank_chunk
def _subtree_label(node):
    """Return the label of a tree node, or None when *node* is a leaf.

    Uses ``label()`` when the node provides it (NLTK 3) and falls back to
    the legacy ``.node`` attribute (NLTK 2) otherwise.
    """
    getter = getattr(node, "label", None)
    if callable(getter):
        return getter()
    return getattr(node, "node", None)


def printTree(tree, file, tabs=0):
    """Write *tree* to *file* as a tab-indented bracketed structure.

    tree -- labelled tree whose children are subtrees or leaves; a leaf is
            written from its first two elements, i.e. ``(word tag)``
    file -- object with a ``write(str)`` method
    tabs -- indentation depth of this node (0 for the root)

    Each subtree starts on a new line at ``tabs`` tabs; leaves and the
    closing bracket are written at ``tabs + 1`` tabs.
    """
    child_indent = '\t' * (tabs + 1)
    file.write('\n')
    file.write('\t' * tabs)
    file.write("(" + _subtree_label(tree))
    for node in tree:
        # a child counts as a subtree when it carries a label; this also
        # covers Tree subclasses, which an exact type check would miss
        if _subtree_label(node) is not None:
            printTree(node, file, tabs + 1)
        else:
            file.write(child_indent)
            file.write("(" + str(node[0]) + " " + str(node[1]) + ")")
    file.write(child_indent)
    file.write(')\n')
#########################################
def main():
    """Chunk a fixed set of treebank files and append the result to a file.

    Reads the chunked sentences of the listed ``treebank_chunk`` files,
    applies every NP rule to each sentence tree in list order (each rule up
    to 15 times), and appends the resulting trees to ``superchunks.txt`` in
    the format produced by ``printTree``.
    """
    files = ["wsj_0156.pos",
             "wsj_0160.pos",
             "wsj_0163.pos",
             "wsj_0165.pos",
             "wsj_0167.pos",
             "wsj_0170.pos",
             "wsj_0175.pos",
             "wsj_0187.pos",
             "wsj_0195.pos",
             "wsj_0196.pos"]
    test_trees = treebank_chunk.chunked_sents(files)
    NP_rules = ["NP , NP CC NP", "NP CC NP", "NP IN NP", "NP TO NP", "NP NP", "NP NN NP", "NP , NP ,", "RB VBN NP"]
    #NP_rules = ["DT NN", "JJ NN", "DT NP"] #these rules can be tried to show that multiple kinds of chunking work
    ##create rule objects
    rules = [Rule(ruleString, "NP") for ruleString in NP_rules]
    # the with-block guarantees the output file is flushed and closed, even
    # if chunking or printing raises part-way through
    with open("superchunks.txt", "a") as myChunks:
        for tree in test_trees:
            for rule in rules:
                tree = rule.chunk(tree, rule, 15)
            printTree(tree, myChunks)
#########################################################