/
chunkers.py
66 lines (51 loc) · 1.99 KB
/
chunkers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
from nltk.corpus import names, ieer, gazetteers
def chunk_trees2train_chunks(chunk_sents):
    """Convert chunk trees into tagger training data.

    Each tree is flattened to CoNLL IOB triples, then regrouped as
    ((word, pos), iob) pairs — the shape ClassifierBasedTagger expects,
    where the "token" is the (word, pos) pair and the "tag" is the IOB label.
    """
    converted = []
    for tree in chunk_sents:
        conll_triples = tree2conlltags(tree)
        converted.append([((word, pos), iob) for word, pos, iob in conll_triples])
    return converted
def prev_next_pos_iob(tokens, index, history):
    """Feature detector for IOB chunk tagging.

    Builds a feature dict for the token at ``index`` from the current
    (word, pos), the previous token's word/pos plus its already-predicted
    IOB tag from ``history``, and the next token's word/pos.  Sequence
    boundaries are padded with '<START>' / '<END>' sentinels.
    """
    word, pos = tokens[index]

    # Previous-token features; the IOB tag comes from the tagger's history.
    if index > 0:
        prevword, prevpos = tokens[index - 1]
        previob = history[index - 1]
    else:
        prevword = prevpos = previob = '<START>'

    # Next-token features (no IOB available — it hasn't been predicted yet).
    if index < len(tokens) - 1:
        nextword, nextpos = tokens[index + 1]
    else:
        nextword = nextpos = '<END>'

    return {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob,
    }
class ClassifierChunker(ChunkParserI):
    """IOB chunker backed by an NLTK ClassifierBasedTagger.

    The tagger is trained on ((word, pos), iob) pairs derived from the
    chunk trees in ``train_sents``; ``parse`` then tags a POS-tagged
    sentence and reassembles the IOB output into a chunk tree.
    """

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        """Train the underlying tagger.

        train_sents      -- iterable of chunk trees (e.g. from conll2000).
        feature_detector -- feature function (tokens, index, history) -> dict;
                            ``None`` selects the default ``prev_next_pos_iob``.
        kwargs           -- passed through to ClassifierBasedTagger.
        """
        # BUG FIX: the original fell back to ``self.feature_detector``, an
        # attribute that is never defined, so passing feature_detector=None
        # raised AttributeError. Fall back to the module default instead.
        if feature_detector is None:
            feature_detector = prev_next_pos_iob
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
            feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):
        """Chunk a POS-tagged sentence [(word, pos), ...] into a tree.

        Returns None for an empty/falsy input (preserved from the original
        interface; callers rely on it).
        """
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
if __name__ == '__main__':
    # Demo: train on the CoNLL-2000 chunking corpus and chunk one sentence.
    from nltk.corpus import conll2000

    # The original also loaded treebank_chunk.chunked_sents() into an unused
    # variable (a slow, pointless corpus load) and imported nltk.tokenize
    # without using it; both removed.
    conll_train = conll2000.chunked_sents('train.txt')
    chunker = ClassifierChunker(conll_train)

    sentence = "I am a boy."
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    print(chunker.parse(tagged))