/
chunking.py
209 lines (170 loc) · 6.13 KB
/
chunking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Chunking: patterns of POS tags that make up a chunk of words
Chink: what words should not be in a chunk
"""
# pylint: disable=C0103
import pickle
from nltk.corpus import conll2000, treebank_chunk
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser, ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import BigramTagger, UnigramTagger, ClassifierBasedTagger
from samples import quote_1
# Samples and loadings
with open("pickles/pos-taggers/brill_tagger.pickle", "rb") as file:
pos_tagger = pickle.load(file)
tagged_1 = pos_tagger.tag(word_tokenize(quote_1))
# Starts with a determiner, followed by any nouns, followed by an arbritary number of any words
# until another noun is reached. Verbs should be chinked, separating the chunk
chunker_1 = RegexpParser(r"""
NP:
{<DT><NN.*><.*>*<NN.*>}
}<VB.*>{ """)
# Creating the chunks
chunks = chunker_1.parse(tagged_1)
print(f"Chunks: {chunks}")
chunks.draw()
# Chunking with merge and split rules.
# Line 1 - Chunk rule: grab from a determiner to a noun.
# Line 2 - Split rule: split between a noun and anything else.
# Line 3 - Split rule: split between anything and a determiner.
# Line 4 - Merge rule: merge nouns together
chunker_2 = RegexpParser(r"""
NP:
{<DT><.*>*<NN.*>}
<NN.*>}{<.*>
<.*>}{<DT>
<NN.*>{}<NN.*> """)
chunks = chunker_2.parse(tagged_1)
print(f"Chunks: {chunks}")
chunks.draw()
# Making a chunker and testing it's accuracy
# 1.1: Chunk optional determiner with nouns
# 1.2: Merge adjective with noun chunk
# 2.1: Chunk preposition
# 3.1: Chunk optional modal with verb
chunker = RegexpParser(r'''
NP:
{<DT>?<NN.*>+}
<JJ>{}<NN.*>
PP:
{<IN>}
VP:
{<MD>?<VB.*>}
''')
score = chunker.evaluate(conll2000.chunked_sents())
print(f"Accuracy of regex chunker: {score.accuracy()}")
# Tagger-based chunker
def conll_tag_chunks(chunk_sents):
"""
Extracts a list of tuples (pos, iob) from a list of trees.
"""
tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def make_backoffs(training, tagger_classes, backoff=None):
"""
Function for training and make chains of backoff tagger
"""
# Make a tagger using the previous one as a backoff
for cls in tagger_classes:
backoff = cls(training, backoff=backoff)
return backoff
class TagChunker(ChunkParserI): # pylint: disable = W0223
"""
Class implementation of the tag-based chunker
"""
def __init__(self, train_chunks, tagger=None, tagger_classes=[UnigramTagger, BigramTagger]): # pylint: disable=W0102
train_sents = conll_tag_chunks(train_chunks)
if tagger is None:
self.tagger = make_backoffs(train_sents, tagger_classes)
else:
self.tagger = tagger
def parse(self, tokens):
"""
Parse sentence to chunks
"""
if not tokens:
return None
(words, tags) = zip(*tokens)
gen_chunks = self.tagger.tag(tags)
wtc = zip(words, gen_chunks)
return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
# Separating data and getting chunker accuracy
train_ck = treebank_chunk.chunked_sents()[:3000]
test_ck = treebank_chunk.chunked_sents()[3000:]
train_conll = conll2000.chunked_sents("train.txt")
test_conll = conll2000.chunked_sents("test.txt")
# With unigram and bigram taggers
chunker = TagChunker(train_ck)
score = chunker.evaluate(test_ck)
print(f"Accuracy of tag chunker on treebank: {score.accuracy()}")
# Saving pickle
with open('pickles/chunkers/tag_chunker_treebank.pickle', 'wb') as file:
pickle.dump(chunker, file)
chunker = TagChunker(train_conll)
score = chunker.evaluate(test_conll)
print(f"Accuracy of tag chunker on conll2000: {score.accuracy()}")
# Saving pickle
with open('pickles/chunkers/tag_chunker_conll2000.pickle', 'wb') as file:
pickle.dump(chunker, file)
# Classification-based chunking
def chunk_trees2train_chunks(chunk_sents):
"""
Convert tuples (word, pos, iob) ot ((word, pos), iob)
"""
tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
def prev_next_pos_iob(tokens, index, history):
"""
Feature detector function for the classifier
"""
word, pos = tokens[index]
if index == 0:
prevword, prevpos, previob = ('<START>',) * 3
else:
prevword, prevpos = tokens[index - 1]
previob = history[index - 1]
if index == len(tokens) - 1:
nextword, nextpos = ('<END>',) * 2
else:
nextword, nextpos = tokens[index + 1]
feats = {
'word': word,
'pos': pos,
'nextword': nextword,
'nextpos': nextpos,
'prevword': prevword,
'prevpos': prevpos,
'previob': previob
}
return feats
class ClassifierChunker(ChunkParserI): # pylint: disable = W0223
"""
Classifier-based chunker class implementation
"""
def __init__(self, train_sents, feature_detector, **kwargs):
train_chunks = chunk_trees2train_chunks(train_sents)
self.tagger = ClassifierBasedTagger(
train=train_chunks, feature_detector=feature_detector, **kwargs)
def parse(self, tokens):
"""
Parse sentence into chunks
"""
if not tokens:
return None
chunked = self.tagger.tag(tokens)
return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
# Testing accuracy with treebank
cl_chunker = ClassifierChunker(train_ck, prev_next_pos_iob)
score = cl_chunker.evaluate(test_ck)
print(f"Accuracy of classifier chunker on treebank: {score.accuracy()}")
# Saving pickle
with open('pickles/chunkers/classifier_chunker_treebank.pickle', 'wb') as file:
pickle.dump(cl_chunker, file)
# Testing accuracy with conll2000
cl_chunker = ClassifierChunker(train_conll, prev_next_pos_iob)
score = cl_chunker.evaluate(test_conll)
print(f"Accuracy of classifier chunker on conll2000: {score.accuracy()}")
# Saving pickle
with open('pickles/chunkers/classifier_chunker_conll2000.pickle', 'wb') as file:
pickle.dump(cl_chunker, file)