forked from UFAL-DSG/tgen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_tgen.py
executable file
·319 lines (268 loc) · 11.4 KB
/
run_tgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Generating T-trees from dialogue acts.
Usage: ./tgen.py <action> <argument1 ...>
Actions:
candgen_train -- train candidate generator (probability distributions)
- arguments: [-l] [-n] [-p prune_threshold] train-das train-ttrees output-model
* l = create lexicalized candgen (limit using parent lemmas as well as formemes)
* n = engage limits on number of nodes (on depth levels + total number)
percrank_train -- train perceptron global ranker
- arguments: [-d debug-output] [-c candgen-model] [-s data-portion] [-j parallel-jobs] [-w parallel-work-dir] \\
[-e experiment_id] ranker-config train-das train-ttrees output-model
sample_gen -- generate using the given candidate generator
- arguments: [-n trees-per-da] [-o oracle-eval-ttrees] [-w output-ttrees] candgen-model test-das
asearch_gen -- generate using the A*search sentence planner
- arguments: [-e eval-ttrees-file] [-s eval-ttrees-selector] [-d debug-output] [-w output-ttrees] [-c config] candgen-model percrank-model test-das
"""
from __future__ import unicode_literals
import sys
from getopt import getopt
import platform
import os
from alex.components.nlg.tectotpl.core.util import file_stream
from alex.components.nlg.tectotpl.core.document import Document
from flect.config import Config
from tgen.logf import log_info, set_debug_stream, log_debug
from tgen.futil import read_das, read_ttrees, chunk_list, add_bundle_text, \
trees_from_doc, ttrees_from_doc, write_ttrees
from tgen.candgen import RandomCandidateGenerator
from tgen.rank import PerceptronRanker
from tgen.planner import SamplingPlanner, ASearchPlanner
from tgen.eval import p_r_f1_from_counts, corr_pred_gold, f1_from_counts, ASearchListsAnalyzer, \
EvalTypes, Evaluator
from tgen.tree import TreeData
from tgen.parallel_percrank_train import ParallelRanker
from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
from tgen.debug import exc_info_hook
# Start IPdb on error in interactive mode
sys.excepthook = exc_info_hook
def candgen_train(args):
    """Train the random candidate generator and save the model to a file.

    Command-line arguments: [-p prune_threshold] [-l] [-n] [-c type[:limit]] [-s] \\
        train-das train-ttrees output-model
    """
    opts, files = getopt(args, 'p:lnc:s')

    # candidate generator settings; defaults overridden by command-line options
    settings = {'prune_threshold': 1,
                'parent_lemmas': False,
                'node_limits': False,
                'compatible_dais_type': None,
                'compatible_dais_limit': None,
                'compatible_slots': False}

    for opt, arg in opts:
        if opt == '-p':
            settings['prune_threshold'] = int(arg)
        elif opt == '-l':
            settings['parent_lemmas'] = True
        elif opt == '-n':
            settings['node_limits'] = True
        elif opt == '-c':
            # the -c value may carry an optional numeric limit after a colon
            if ':' in arg:
                ctype, climit = arg.split(':', 1)
                settings['compatible_dais_type'] = ctype
                settings['compatible_dais_limit'] = int(climit)
            else:
                settings['compatible_dais_type'] = arg
        elif opt == '-s':
            settings['compatible_slots'] = True

    if len(files) != 3:
        sys.exit(__doc__)
    fname_da_train, fname_ttrees_train, fname_cand_model = files

    log_info('Training candidate generator...')
    candgen = RandomCandidateGenerator(settings)
    candgen.train(fname_da_train, fname_ttrees_train)
    candgen.save_to_file(fname_cand_model)
def percrank_train(args):
    """Train the global perceptron (or neural-network) ranker and store the model.

    Command-line arguments: [-d debug-output] [-c candgen-model] [-s data-portion] \\
        [-j parallel-jobs] [-w parallel-work-dir] [-e experiment-id] \\
        ranker-config train-das train-ttrees output-model
    """
    opts, files = getopt(args, 'c:d:s:j:w:e:')

    # option defaults
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            # redirect debugging output into the given file
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg

    if len(files) != 4:
        sys.exit(__doc__)
    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files

    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        # a candgen model given on the command line overrides the config
        rank_config['candgen_model'] = candgen_model

    # select the ranker implementation according to the 'nn' config setting
    nn_setting = rank_config.get('nn')
    if not nn_setting:
        ranker_class = PerceptronRanker
    elif nn_setting == 'emb':
        ranker_class = EmbNNRanker
    else:
        ranker_class = SimpleNNRanker

    if parallel:
        # wrap the ranker class in a parallel-training head process
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            # default the working directory to where the config file lives
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)
    else:
        ranker = ranker_class(rank_config)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)
    ranker.save_to_file(fname_rank_model)
def sample_gen(args):
    """Generate t-trees by sampling from the candidate generator, with an
    optional oracle F1 evaluation against gold trees.

    Command-line arguments: [-n trees-per-da] [-o oracle-eval-ttrees] \\
        [-w output-ttrees] candgen-model test-das
    """
    opts, files = getopt(args, 'r:n:o:w:')
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None
    for opt, arg in opts:
        if opt == '-n':
            num_to_generate = int(arg)
        elif opt == '-o':
            oracle_eval_file = arg
        elif opt == '-w':
            fname_ttrees_out = arg
    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files
    # load model
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    # the candidate generator doubles as the (random) ranker here
    ranker = candgen
    tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker})
    # generate
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):  # repeat generation n times
            tgen.generate_tree(da, gen_doc)
    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')
        correct, predicted, gold = 0, 0, 0
        # BUGFIX: the inner loop variable used to be named `gen_trees`, shadowing
        # the list being chunked; that only worked because Python 2's zip()
        # materializes eagerly -- renamed to `gen_chunk` to remove the trap
        for gold_tree, gen_chunk in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # find best of the predicted trees for this DA (in terms of F1)
            _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g) for c, p, g
                                 in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree),
                                        gen_chunk)],
                                key=lambda x: x[0])
            correct += tc
            predicted += tp
            gold += tg
        # evaluate oracle F1
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold))
    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
def asearch_gen(args):
    """A*search generation, with optional evaluation against gold t-trees.

    Command-line arguments: [-e eval-ttrees-file] [-s eval-ttrees-selector] \\
        [-d debug-output] [-w output-ttrees] [-c config] \\
        candgen-model percrank-model test-das
    """
    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''
    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            # redirect debugging output into the given file
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg
    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)
    log_info('Generating...')
    das = read_das(fname_da_test)
    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            # gold and generated selectors clash -> keep outputs in a new document
            gen_doc = Document()
        else:
            # selectors differ -> generated trees can live in the gold document
            gen_doc = eval_doc
    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das,
                                                  trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            open_list, close_list = tgen.generate_tree(da, gen_doc, return_lists=True)
            lists_analyzer.append(gold_tree, open_list, close_list)
            gen_tree = close_list.peek()[0]
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")
        # BUGFIX: the last conversion was '%4f' (field width 4, default precision)
        # instead of '%.4f' like the other two -- fixed for consistent output
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())
        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.tree_size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_subtree_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)
    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
if __name__ == '__main__':
    # dispatch to the requested action; every action needs at least one argument
    if len(sys.argv) < 3:
        sys.exit(__doc__)
    action = sys.argv[1]
    args = sys.argv[2:]
    log_info('Running on %s version %s' % (platform.python_implementation(),
                                           platform.python_version()))
    if action == 'candgen_train':
        candgen_train(args)
    elif action == 'percrank_train':
        percrank_train(args)
    elif action in ('sample_gen', 'random_gen'):
        # BUGFIX: the usage text advertises 'sample_gen' but only 'random_gen'
        # was accepted -- both names now work (old one kept for compatibility)
        sample_gen(args)
    elif action == 'asearch_gen':
        asearch_gen(args)
    else:
        # Unknown action
        sys.exit(("\nERROR: Unknown Tgen action: %s\n\n---" % action) + __doc__)
    log_info('Done.')