/
bootstrapped_run.py
150 lines (119 loc) · 5.62 KB
/
bootstrapped_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import shutil
import tempfile
import sys
import subprocess
import logging
from logger import configure_logger
from io_ import load_doc, LAFDocument
def main(args):
    """Bootstrap an NER model.

    Train on gazetteer-seeded LAF files, tag the corpus, merge any newly
    found NE mentions into the training set, and repeat until an iteration
    adds no new mentions.  Finally score the last completed output.

    args: [language, working directory, threshold value, threshold change step value]
    """
    if len(args) != 4:
        print("Usage: "+sys.argv[0]+" [language] [working directory] [threshold value] [threshold change step value]")
        sys.exit(1)  # was the interactive-only builtin exit(1)
    lang = args[0]
    working_dir = os.path.join("/Users/authorofnaught/Projects/LORELEI/NER/WORKING/", os.path.basename(args[1]))
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)  # makedirs: also create missing parents
    threshold = args[2]
    threshold_step = args[3]  # NOTE(review): not applied yet -- see TODO at the bottom of the loop
    logger = logging.getLogger()
    configure_logger(logger)
    temp_dir = tempfile.mkdtemp()

    # These directories and files are not updated.
    GAZ_LAF_DIR = "/Users/authorofnaught/Projects/LORELEI/NER/REF-LAF/"+lang+"/"  # gazetteer-generated LAF files seeding training
    REF_LAF_DIR = "/Users/authorofnaught/Projects/LORELEI/NER/REF-LAF/"+lang+"/"  # gold standard LAF files for scoring (was commented out, leaving scorecmd with an undefined name)
    LTF_DIR_ABG = "/Users/authorofnaught/Projects/LORELEI/NER/LTF-ABG/"+lang+"/"  # LTF files with uhhmm features
    TEST_SCP = "/Users/authorofnaught/Projects/LORELEI/NER/TEST-SCP/"+lang+"ALL.txt"  # paths to LTF files for tagging, one per line

    # These directories and files are updated with each iteration.
    iteration = 0
    MODEL_DIR = os.path.join(working_dir, str(iteration), 'model')      # directory for trained model
    SYS_LAF_DIR = os.path.join(working_dir, str(iteration), 'sys_laf')  # directory for tagger output (LAF files)
    TRAIN_SCP = os.path.join(temp_dir, 'trainingfiles')                 # paths to LAF files for training, one per line
    updateTrainingScript(GAZ_LAF_DIR, TRAIN_SCP)  # initialize TRAIN_SCP with all gazetteer-generated LAFs

    changeinNEs = True  # was "changeinNEs = True:" -- a syntax error
    while changeinNEs:
        # Rebuild the command lists every pass: MODEL_DIR and SYS_LAF_DIR are
        # re-pointed at the end of each iteration, and lists built once before
        # the loop would keep referencing the iteration-0 paths forever.
        traincmd = ["./train.py",
                    "--displayprogress",   # display crfsuite output of model iterations
                    "-t", str(threshold),  # was the float literal 0.4: subprocess args must be strings, and the CLI threshold was ignored
                    "-S", TRAIN_SCP,
                    MODEL_DIR,
                    LTF_DIR_ABG
                    ]
        tagcmd = ["./tagger.py",
                  "-S", TEST_SCP,
                  "-L", SYS_LAF_DIR,
                  MODEL_DIR
                  ]
        subprocess.call(traincmd)
        subprocess.call(tagcmd)
        if iteration != 0:
            # Merge this iteration's mentions with the previous iteration's
            # (in place, so the merged files land in SYS_LAF_DIR); stop when
            # no new mentions appeared.
            SYS_LAF_DIR, changeinNEs = updateNEdirs(PREV_SYS_LAF_DIR, SYS_LAF_DIR, SYS_LAF_DIR)
        iteration += 1
        PREV_SYS_LAF_DIR = SYS_LAF_DIR
        MODEL_DIR = os.path.join(working_dir, str(iteration), 'model')
        SYS_LAF_DIR = os.path.join(working_dir, str(iteration), 'sys_laf')
        updateTrainingScript(PREV_SYS_LAF_DIR, TRAIN_SCP)
        # TODO: update threshold by threshold_step for each iteration

    print("Bootstrapping stopped after {} iterations".format(iteration))
    # Score the last completed system output.  The original referenced the
    # undefined names REF_LAF_DIR and LTF_DIR here; LTF_DIR_ABG is the only
    # LTF directory in scope -- TODO confirm it is the one score.py expects.
    scorecmd = ["./score.py",
                REF_LAF_DIR,
                PREV_SYS_LAF_DIR,
                LTF_DIR_ABG]
    subprocess.call(scorecmd)
    shutil.rmtree(temp_dir)
"""
Update TRAIN_SCP to contain LAF pathnames to be used in training in next iteration
"""
def updateTrainingScript(laf_dir, scriptfile):
with open(scriptfile, 'w') as outfile:
for fn in os.listdir(laf_dir):
if fn.endswith('laf.xml'):
outfile.write("{}\n".format(os.path.join(laf_dir, fn)))
"""
Add new NE mentions to old NE mentions, if any,
"""
def updateNEdirs(prev_laf_dir, temp_laf_dir, new_laf_dir):
changeinNEs = False
for fn in prev_laf_dir:
if fn.endswith('laf.xml'):
prev_laf = os.path.join(prev_laf_dir, fn)
temp_laf = os.path.join(temp_laf_dir, fn)
try:
assert os.path.exists(temp_laf)
except AssertionError:
logging.warn("{} processed last iteration but not this one".format(fn))
for fn in temp_laf_dir:
if fn.endswith('laf.xml'):
prev_laf = os.path.join(prev_laf_dir, fn)
temp_laf = os.path.join(temp_laf_dir, fn)
try:
assert os.path.exists(prev_laf)
except AssertionError:
logging.warn("{} processed this iteration but not the last. Skipping...".format(fn))
continue
prev_laf_doc = load_doc(prev_laf, LAFDocument, logger)
temp_laf_doc = load_doc(temp_laf, LAFDocument, logger)
doc_id = prev_laf_doc.doc_id
prev_mentions = [[tag, extent, start_char, end_char] for [entity_id, tag, extent, start_char, end_char] in prev_laf_doc.mentions()]
prev_spans = [(start_char, end_char) for [tag, extent, start_char, end_char] in prev_mentions]
temp_mentions = [[tag, extent, start_char, end_char] for [entity_id, tag, extent, start_char, end_char] in temp_laf_doc.mentions()]
mentions = []
for m in prev_mentions:
mentions.append(m)
for m in temp_mentions:
if (m[2], m[3]) not in prev_spans:
mentions.append(m)
changeinNEs == True
# Sort new mentions list by start_char then end_char
mentions = sorted(mentions, key = lambda x: (int(x[2]), int(x[3])))
n=1
for tag, extent, start_char, end_char in mentions:
entity_id = '{}-NE{}'.format(doc_id, n)
mentions.append([entity_id, tag, extent, start_char, end_char])
n+=1
laf = os.path.join(new_laf_dir, fn)
laf_doc = LAFDocument(mentions=mentions, lang=ltf_doc.lang, doc_id=doc_id)
laf_doc.write_to_file(laf)
return new_laf_dir, changeinNEs
# Script entry point: forward all CLI arguments after the program name to main().
if __name__ == '__main__':
    main(sys.argv[1:])