/
build_suc_ne.py
127 lines (98 loc) · 3.97 KB
/
build_suc_ne.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Tagger configuration for named entities using the SUC corpus + the Stockholm
# Internet Corpus (SIC). It requires SUC-tagged and lemmatized data as input,
# and is intended for use with the Swedish annotation pipeline.
from options import args
from configuration import Configuration
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict
import sys
# The field indices used later in this file (text, lemma, full SUC tag, NE
# tag) assume exactly four columns in the training data.  This is checked
# explicitly instead of with an `assert` statement, because `assert` is
# removed when Python runs with -O and the option comes from the command
# line.  AssertionError is kept so the failure type is unchanged.
if args.n_train_fields != 4:
    raise AssertionError(
        'suc_ne requires exactly 4 training data fields, got %r'
        % (args.n_train_fields,))

config = Configuration('suc_ne', args)

# If generation is to be skipped, build right away and exit before any
# corpus data or lexicons are read.
if config.skip_generate:
    config.build()
    sys.exit(0)
# Column indices of the four training data fields.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Read the named-entity tagset and the tag lexicon from the training corpus.
# The first result is the collection of tags; the second maps each key read
# from the lemma column (presumably normalized -- confirm in tools.read_dict)
# to the tags it occurs with.
suc_ne_tags, suc_norm_ne_tags = read_dict(
    'suc-data/suc-blogs-ne-train.tab', lemma_field, tag_field)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Lexicons loaded from file: a name list and Brown word clusters.
Names = WCLexicon.from_file('names', 'suc-data/names.txt', config)
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)
# Tags relative to the current position during a search: the current one,
# the previous one, and the one before that (offsets 0, -1, -2).
this_tag, last_tag, last_last_tag = [
    SUC_NE.tag(tag_field, offset) for offset in (0, -1, -2)]
# POS tags (+ morphology), taken from the full SUC tag column, relative to
# the current position during a search.
this_pos = TextField(suc_full_field, 0)
last_pos = TextField(suc_full_field, -1)
# Offset +1, in line with next_lemma and next_word.  The previous value of 2
# made this feature look two tokens ahead, which is what the next_next_*
# names denote elsewhere in this file.
# NOTE: this alters a feature template, so models trained with the old
# definition need to be retrained.
next_pos = TextField(suc_full_field, 1)
# Lemmas and words relative to the current position during a search.  The
# offsets are, in order: current (0), previous (-1), next (+1) and the one
# after next (+2); "last" has no lemma/word two steps back.
this_lemma, last_lemma, next_lemma, next_next_lemma = [
    TextField(lemma_field, offset) for offset in (0, -1, 1, 2)]
this_word, last_word, next_word, next_next_word = [
    TextField(text_field, offset) for offset in (0, -1, 1, 2)]
# Name lexicon lookups, keyed on the lemma at each position.
this_name, last_name, next_name = [
    Names.lookup(lemma) for lemma in (this_lemma, last_lemma, next_lemma)]

# Word cluster lookups, keyed on the word form itself (case-sensitive).
this_wc, last_wc, next_wc, next_next_wc = [
    WC.lookup(word)
    for word in (this_word, last_word, next_word, next_next_word)]
# Feature templates: every tuple is one template, and each of them combines
# the current tag with one or more context features.  The order of the
# templates is kept exactly as it has always been.
fs = FeatureSet(
    [
        # Tag history
        (this_tag, last_tag),
        (this_tag, last_tag, last_last_tag),
        # POS context
        (this_tag, last_pos),
        (this_tag, this_pos),
        (this_tag, next_pos),
        # Name lexicon
        (this_tag, last_name),
        (this_tag, this_name),
        (this_tag, next_name),
        (this_tag, this_name, last_name),
        (this_tag, this_name, next_name),
        # Word clusters
        (this_tag, last_wc),
        (this_tag, this_wc),
        (this_tag, next_wc),
        (this_tag, next_wc, next_next_wc),
        (this_tag, last_wc, next_wc),
        # Shape of the current word
        (this_tag, delexicalize(this_word)),
        (this_tag, abstract(this_word)),
        # Lemmas
        (this_tag, this_lemma),
        (this_tag, this_lemma, last_lemma),
        (this_tag, this_lemma, next_lemma),
        (this_tag, next_lemma),
        (this_tag, last_lemma),
        # This is actually required for the tag lexicon to work properly
        (this_tag, normalize(this_lemma)),
    ]
    # prefix(..., n) of the normalized current word for n = 1..4, followed
    # by suffix(..., n) for n = 1..4.
    + [(this_tag, prefix(normalize(this_word), n)) for n in (1, 2, 3, 4)]
    + [(this_tag, suffix(normalize(this_word), n)) for n in (1, 2, 3, 4)],
    config)
# Indices of all tags in the tagset, in ascending order.  They are handed to
# the TagLexicon as its open tags (presumably the tags permitted for keys
# that are not in the lexicon -- confirm in taglexicon).
open_tags = [SUC_NE.tag_idx[name] for name in SUC_NE.tags]
open_tags.sort()

# Build the TagLexicon from the mapping loaded with read_dict() earlier in
# this file, translating each tag into its index in the tagset.
tl = TagLexicon(
    'SUC_NE_lexicon', lemma_field, len(suc_norm_ne_tags), open_tags, config)
for key, key_tags in suc_norm_ne_tags.items():
    tl[key] = [SUC_NE.tag_idx[name] for name in key_tags]

# Generate C code and (optionally) compile.
config.build()