def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DialogueAct()
    empty_da.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
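# Illustrative sketch (not TGen code): the DictVectorizer above is TGen's own
# (with binarize_numeric=True); scikit-learn's DictVectorizer behaves analogously
# for string-valued features and shows the dicts -> 1-hot matrix step in
# isolation. Feature names and values below are made up for illustration.
from sklearn.feature_extraction import DictVectorizer as SklearnDictVectorizer

feat_dicts = [{'dat': 'inform', 'svp_food': 'Chinese'},
              {'dat': 'request', 'svp_area': 'north'}]
vect = SklearnDictVectorizer(sparse=False)
y_demo = vect.fit_transform(feat_dicts)  # one binary column per feature=value pair
print(vect.get_feature_names_out())      # get_feature_names() in older scikit-learn
print(y_demo)  # [[1. 0. 0. 1.] [0. 1. 1. 0.]]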
def evaluate_file(self, das_file, ttree_file):
    """Evaluate the reranking classifier on a given pair of DA/tree files (show the
    total Hamming distance and total number of DAIs)

    @param das_file: DA file path
    @param ttree_file: trees/sentences file path
    @return: a tuple (total DAIs, distance)
    """
    das = read_das(das_file)
    ttree_doc = read_ttrees(ttree_file)
    if self.mode == 'tokens':
        tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
        trees = self._tokens_to_flat_trees(tokens)
    elif self.mode == 'tagged_lemmas':
        tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
        trees = self._tokens_to_flat_trees(tls)
    else:
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

    da_len = 0
    dist = 0
    for da, tree in zip(das, trees):
        da_len += len(da)
        dist += self.dist_to_da(da, [tree])[0]

    return da_len, dist
def evaluate_file(self, das_file, ttree_file):
    """Evaluate the reranking classifier on a given pair of DA/tree files (show the
    total Hamming distance and total number of DAIs)

    @param das_file: DA file path
    @param ttree_file: trees/sentences file path
    @return: a tuple (total DAIs, distance, list of classified DAs)
    """
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees/tokens from ' + ttree_file + '...')
    trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)
    if self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    tot_len = 0
    tot_dist = 0
    classif_das = []
    for da, tree in zip(das, trees):
        tot_len += len(da)
        dist, classif = self.dist_to_da(da, [tree], return_classif=True)
        tot_dist += dist[0]
        classif_das.append(DA.parse_features(classif[0]))

    return tot_len, tot_dist, classif_das
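# Usage sketch (hypothetical: `clf` stands for a trained instance of the class
# defining evaluate_file above; the class name, loading helper, and file paths
# are placeholders, not confirmed API):
#
#     clf = RerankingClassifier.load_from_file('classif_model.pickle.gz')
#     tot_len, tot_dist, classif_das = clf.evaluate_file('devel-das.txt', 'devel-text.yaml.gz')
#
# The two totals then give a corpus-level DAI error rate:
tot_len, tot_dist = 1385, 97  # made-up example counts
print('Hamming distance: %d/%d DAIs = %.2f%% error'
      % (tot_dist, tot_len, 100.0 * tot_dist / tot_len))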
def get_training_das_texts():
    if DATASET_WEBNLG:
        return get_das_texts_from_webnlg('WebNLG_Reader/data/webnlg/train.json')
    das = read_das("tgen/e2e-challenge/input/train-das.txt")
    texts = [[START_TOK] + x + [END_TOK] for x in get_texts_training()]
    return das, texts
def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
def sample_gen(args):
    from pytreex.core.document import Document
    opts, files = getopt(args, 'r:n:o:w:')
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None

    for opt, arg in opts:
        if opt == '-n':
            num_to_generate = int(arg)
        elif opt == '-o':
            oracle_eval_file = arg
        elif opt == '-w':
            fname_ttrees_out = arg

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # load model
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = candgen
    tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker})

    # generate
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):  # repeat generation n times
            tgen.generate_tree(da, gen_doc)

    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')

        correct, predicted, gold = 0, 0, 0
        # NB: the generated-tree chunk is named gen_chunk here to avoid shadowing gen_trees
        for gold_tree, gen_chunk in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # find best of predicted trees (in terms of F1)
            _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g)
                                 for c, p, g in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree),
                                                    gen_chunk)],
                                key=lambda x: x[0])
            correct += tc
            predicted += tp
            gold += tg

        # evaluate oracle F1
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f"
                 % p_r_f1_from_counts(correct, predicted, gold))

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
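# Toy sketch (not TGen code) of the oracle selection above: for each gold tree,
# keep the (correct, predicted, gold) counts of the generated candidate with the
# best F1, then pool the counts over the whole corpus.
def f1_from_counts_demo(correct, predicted, gold):
    if not predicted or not gold:
        return 0.0
    p, r = float(correct) / predicted, float(correct) / gold
    return 2 * p * r / (p + r) if p + r else 0.0

candidate_counts = [(3, 5, 4), (4, 5, 4)]  # two candidates for one gold tree
_, tc, tp, tg = max([(f1_from_counts_demo(c, p, g), c, p, g)
                     for c, p, g in candidate_counts],
                    key=lambda x: x[0])
print(tc, tp, tg)  # 4 5 4 -- the oracle-best candidate's counts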
def _load_valid_data(self, valid_data_paths):
    """Load validation data from separate files (comma-separated list of files with
    DAs, trees, and optionally contexts is expected)."""
    # parse validation data file specification
    valid_data_paths = valid_data_paths.split(',')
    if len(valid_data_paths) == 3:  # with contexts (this does not determine if they're used)
        valid_das_file, valid_trees_file, valid_context_file = valid_data_paths
    else:
        valid_das_file, valid_trees_file = valid_data_paths

    # load the validation data
    log_info('Reading DAs from ' + valid_das_file + '...')
    self.valid_das = read_das(valid_das_file)
    self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
    if self.use_context:
        self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

    # reorder validation data for multiple references (see also _cut_valid_data)
    valid_size = len(self.valid_trees)
    if self.multiple_refs:
        num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

        # serial: different instances next to each other, then synonymous in the same order
        if refs_stored == 'serial':
            valid_tree_chunks = [chunk for chunk
                                 in chunk_list(self.valid_trees, valid_size / num_refs)]
            self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                for i in xrange(valid_size / num_refs)]
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[0:valid_size / num_refs]
        # parallel: synonymous instances next to each other
        elif refs_stored == 'parallel':
            self.valid_trees = [chunk for chunk in chunk_list(self.valid_trees, num_refs)]
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[::num_refs]
    # no multiple references; make lists of size 1 to simplify working with the data
    else:
        self.valid_trees = [[tree] for tree in self.valid_trees]
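# Self-contained sketch (with a simplified stand-in for TGen's chunk_list) of
# the two multi-reference layouts handled above; 3 instances (a, b, c) with 2
# references each. Both layouts regroup into one list per instance.
def chunk_list_demo(lst, n):
    return [lst[i:i + n] for i in range(0, len(lst), n)]

num_refs = 2

# serial: all first references, then all second references in the same order
serial_refs = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
chunks = chunk_list_demo(serial_refs, len(serial_refs) // num_refs)
print([[chunk[i] for chunk in chunks]
       for i in range(len(serial_refs) // num_refs)])
# [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]

# parallel: synonymous references next to each other
parallel_refs = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
print(chunk_list_demo(parallel_refs, num_refs))
# [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]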
def get_test_das():
    if DATASET_WEBNLG:
        if VALIDATION_NOT_TEST:
            return get_das_texts_from_webnlg("WebNLG_Reader/data/webnlg/valid.json")[0]
        else:
            return get_das_texts_from_webnlg("WebNLG_Reader/data/webnlg/test.json")[0]

    if VALIDATION_NOT_TEST:
        das_file = "tgen/e2e-challenge/input/devel-das.txt"
    else:
        das_file = "tgen/e2e-challenge/input/test-das.txt"

    das = read_das(das_file)
    return das
def _load_valid_data(self, valid_data_paths):
    """Load validation data from separate files (comma-separated list of files with
    DAs, trees, and optionally contexts is expected)."""
    # parse validation data file specification
    valid_data_paths = valid_data_paths.split(',')
    if len(valid_data_paths) == 3:  # with contexts (this does not determine if they're used)
        valid_das_file, valid_trees_file, valid_context_file = valid_data_paths
    else:
        valid_das_file, valid_trees_file = valid_data_paths

    # load the validation data
    log_info('Reading DAs from ' + valid_das_file + '...')
    self.valid_das = read_das(valid_das_file)
    self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
    if self.use_context:
        self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

    # reorder validation data for multiple references (see also _cut_valid_data)
    valid_size = len(self.valid_trees)
    if self.multiple_refs:
        num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

        # serial: different instances next to each other, then synonymous in the same order
        if refs_stored == 'serial':
            valid_tree_chunks = [chunk for chunk
                                 in chunk_list(self.valid_trees, valid_size / num_refs)]
            self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                for i in xrange(valid_size / num_refs)]
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[0:valid_size / num_refs]
        # parallel: synonymous instances next to each other
        elif refs_stored == 'parallel':
            self.valid_trees = [chunk for chunk in chunk_list(self.valid_trees, num_refs)]
            if len(self.valid_das) > len(self.valid_trees):
                self.valid_das = self.valid_das[::num_refs]
    # no multiple references; make lists of size 1 to simplify working with the data
    else:
        self.valid_trees = [[tree] for tree in self.valid_trees]
def train(self, das, trees, data_portion=1.0, valid_das=None, valid_trees=None):
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_das = das[:train_size]

    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)
    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_trees)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_trees, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)
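# Minimal standalone sketch (not TGen code): seq2seq_gen receives `args` as a
# pre-split list from a dispatching main(), which ArgumentParser.parse_args
# accepts directly instead of reading sys.argv. Paths below are placeholders.
from argparse import ArgumentParser

demo_ap = ArgumentParser()
demo_ap.add_argument('-b', '--beam-size', type=int)
demo_ap.add_argument('seq2seq_model_file', type=str)
demo_ap.add_argument('da_test_file', type=str)

demo_args = demo_ap.parse_args(['-b', '10', 'model.pickle.gz', 'test-das.txt'])
print(demo_args.beam_size, demo_args.seq2seq_model_file, demo_args.da_test_file)
# 10 model.pickle.gz test-das.txt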
if len(files) != 2:
    sys.exit('Usage: python inspect_data.py [-l lang] [-s selector] <trees.yaml> <das.txt>')

language = 'en'
selector = ''
for opt, arg in opts:
    if opt == '-l':
        language = arg
    elif opt == '-s':
        selector = arg

trees = trees_from_doc(read_ttrees(files[0]), language, selector)
das = read_das(files[1])

# TREE SIZES
tree_sizes = defaultdict(int)
for tree in trees:
    tree_sizes[len(tree)] += 1

print "Tree sizes:\n==========="
for k, v in sorted(tree_sizes.items()):
    print k, "\t", v

# DAS -> NODES
das_for_nodes = {}
num_occ_nodes = defaultdict(int)
opts, files = getopt(sys.argv[1:], 'l:s:')
if len(files) != 2:
    sys.exit('Usage: python inspect_data.py [-l lang] [-s selector] <trees.yaml> <das.txt>')

language = 'en'
selector = ''
for opt, arg in opts:
    if opt == '-l':
        language = arg
    elif opt == '-s':
        selector = arg

trees = trees_from_doc(read_ttrees(files[0]), language, selector)
das = read_das(files[1])

# TREE SIZES
tree_sizes = defaultdict(int)
for tree in trees:
    tree_sizes[len(tree)] += 1

print "Tree sizes:\n==========="
for k, v in sorted(tree_sizes.items()):
    print k, "\t", v

# DAS -> NODES
das_for_nodes = {}
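# Standalone sketch (not part of the script above): getopt returns a list of
# (flag, value) pairs plus the remaining positional arguments, which is how
# `opts` and `files` are obtained above. The argument list here is made up.
from getopt import getopt

demo_opts, demo_files = getopt(['-l', 'en', '-s', '', 'trees.yaml', 'das.txt'], 'l:s:')
print(demo_opts)   # [('-l', 'en'), ('-s', '')]
print(demo_files)  # ['trees.yaml', 'das.txt']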
def _init_training(self, das_file, ttree_file, data_portion, context_file, validation_files):
    """Load training data, prepare batches, build the NN.

    @param das_file: training DAs (file path)
    @param ttree_file: training t-trees (file path)
    @param data_portion: portion of the data to be actually used for training
    @param context_file: training contexts (file path)
    @param validation_files: validation file paths (or None)
    """
    # read training data
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    trees = self._load_trees(ttree_file)
    if self.use_context:
        das = self._load_contexts(das, context_file)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # load separate validation data files...
    if validation_files:
        self._load_valid_data(validation_files)
    # ... or save part of the training data for validation:
    elif self.validation_size > 0:
        self._cut_valid_data()  # will set train_trees, valid_trees, train_das, valid_das

    log_info('Using %d training, %d validation instances.'
             % (len(self.train_das), len(self.valid_das)))

    # initialize embeddings
    if self.use_context:
        self.da_embs = ContextDAEmbeddingSeq2SeqExtract(cfg=self.cfg)
    else:
        self.da_embs = DAEmbeddingSeq2SeqExtract(cfg=self.cfg)
    if self.use_tokens:
        self.tree_embs = TokenEmbeddingSeq2SeqExtract(cfg=self.cfg)
    else:
        self.tree_embs = TreeEmbeddingSeq2SeqExtract(cfg=self.cfg)

    self.da_dict_size = self.da_embs.init_dict(self.train_das)
    self.tree_dict_size = self.tree_embs.init_dict(self.train_trees)
    self.max_tree_len = self.tree_embs.get_embeddings_shape()[0]
    self.max_da_len = self.da_embs.get_embeddings_shape()[0]

    # prepare training batches
    self.train_enc = [cut_batch_into_steps(b)
                      for b in grouper([self.da_embs.get_embeddings(da)
                                        for da in self.train_das],
                                       self.batch_size, None)]
    self.train_dec = [cut_batch_into_steps(b)
                      for b in grouper([self.tree_embs.get_embeddings(tree)
                                        for tree in self.train_trees],
                                       self.batch_size, None)]

    # train the classifier for filtering n-best lists
    if self.classif_filter:
        self.classif_filter.train(self.train_das, self.train_trees,
                                  valid_das=self.valid_das,
                                  valid_trees=self.valid_trees)
        self.classif_filter.restore_checkpoint()  # restore the best performance on devel data

    # convert validation data to flat trees to enable F1 measuring
    if self.validation_size > 0 and self.use_tokens:
        self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees)

    # initialize top costs
    self.top_k_costs = [float('nan')] * self.top_k
    self.checkpoint_path = None

    # build the NN
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.initialize_all_variables())
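# Simplified sketch (assumed semantics, not TGen's actual grouper and
# cut_batch_into_steps): grouper pads the last batch with a fill value, and
# cutting a batch into steps transposes it from instance-major to step-major
# order, as needed for feeding an RNN one time step at a time.
def grouper_demo(seq, n, fillvalue=None):
    return [seq[i:i + n] + [fillvalue] * max(0, i + n - len(seq))
            for i in range(0, len(seq), n)]

def cut_batch_into_steps_demo(batch):
    real = [inst for inst in batch if inst is not None]  # drop padding
    return list(zip(*real))  # per-step tuples across instances

embs = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]  # 3 instances, 3 steps each
print([cut_batch_into_steps_demo(b) for b in grouper_demo(embs, 2)])
# [[(1, 4), (2, 5), (3, 6)], [(7,), (8,), (9,)]]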
#!/usr/bin/env python

from flect.config import Config
from tgen.features import Features
from tgen.futil import trees_from_doc, read_ttrees, read_das

import sys
import timeit
import datetime

if len(sys.argv[1:]) != 3:
    sys.exit('Usage: ./bench_feats.py features_cfg.py trees.yaml.gz das.txt')

print >> sys.stderr, 'Loading...'

cfg = Config(sys.argv[1])
trees = trees_from_doc(read_ttrees(sys.argv[2]), 'en', '')
das = read_das(sys.argv[3])

feats = Features(cfg['features'])


def test_func():
    for tree, da in zip(trees, das):
        feats.get_features(tree, {'da': da})


print >> sys.stderr, 'Running test...'
secs = timeit.timeit('test_func()', setup='from __main__ import test_func', number=10)
td = datetime.timedelta(seconds=secs)
print >> sys.stderr, 'Time taken: %s' % str(td)
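# Standalone note on the pattern above: timeit.timeit runs the statement
# `number` times and returns the *total* elapsed seconds, so the per-call time
# is secs / number (here, secs / 10 for one pass over all trees and DAs).
import timeit

secs_demo = timeit.timeit('sum(range(100))', number=1000)
print('per call: %.6f s' % (secs_demo / 1000))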
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # prepare evaluation
    if args.eval_file is None or args.eval_file.endswith('.txt'):  # just tokens
        gen_doc = []
    else:  # trees: depending on PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        if args.ref_selector == args.target_selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file is not None:
        log_info(tgen.get_slot_err_stats())
        # evaluate the generated tokens (F1 and BLEU scores)
        if args.eval_file.endswith('.txt'):
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_doc)
        # evaluate the generated trees against golden trees
        else:
            eval_trees(das,
                       ttrees_from_doc(eval_doc, tgen.language, args.ref_selector),
                       ttrees_from_doc(gen_doc, tgen.language, args.target_selector),
                       eval_doc, tgen.language, tgen.selector)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_doc, args.output_file)
        else:
            write_ttrees(gen_doc, args.output_file)
def asearch_gen(args):
    """A*search generation"""
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open & close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das, trees_from_doc(eval_doc, tgen.language,
                                                                      eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n"
                          + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree)
                          + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f'
                 % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees,
                                                          gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f"
                            % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s"
                 % evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
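# Sketch of the assumed semantics of p_r_f1_from_counts used above: precision,
# recall and F1 computed from (correct, predicted, gold) node counts.
def p_r_f1_from_counts_demo(correct, predicted, gold):
    p = float(correct) / predicted if predicted else 0.0
    r = float(correct) / gold if gold else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

print("P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts_demo(7, 9, 8))
# P: 0.7778 R: 0.8750 F1: 0.8235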
def _init_training(self, das, trees, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]
    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())
def _init_training(self, das_file, ttree_file, data_portion, context_file, validation_files):
    """Load training data, prepare batches, build the NN.

    @param das_file: training DAs (file path)
    @param ttree_file: training t-trees (file path)
    @param data_portion: portion of the data to be actually used for training
    @param context_file: training contexts (file path)
    @param validation_files: validation file paths (or None)
    """
    # read training data
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    trees = self._load_trees(ttree_file)
    if self.use_context:
        das = self._load_contexts(das, context_file)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # load separate validation data files...
    if validation_files:
        self._load_valid_data(validation_files)
    # ... or save part of the training data for validation:
    elif self.validation_size > 0:
        self._cut_valid_data()  # will set train_trees, valid_trees, train_das, valid_das

    log_info('Using %d training, %d validation instances.'
             % (len(self.train_das), len(self.valid_das)))

    # initialize embeddings
    if self.use_context:
        self.da_embs = ContextDAEmbeddingSeq2SeqExtract(cfg=self.cfg)
    else:
        self.da_embs = DAEmbeddingSeq2SeqExtract(cfg=self.cfg)
    if self.use_tokens:
        self.tree_embs = TokenEmbeddingSeq2SeqExtract(cfg=self.cfg)
    else:
        self.tree_embs = TreeEmbeddingSeq2SeqExtract(cfg=self.cfg)

    self.da_dict_size = self.da_embs.init_dict(self.train_das)
    self.tree_dict_size = self.tree_embs.init_dict(self.train_trees)
    self.max_tree_len = self.tree_embs.get_embeddings_shape()[0]
    self.max_da_len = self.da_embs.get_embeddings_shape()[0]

    # prepare training batches
    self.train_enc = [cut_batch_into_steps(b)
                      for b in grouper([self.da_embs.get_embeddings(da)
                                        for da in self.train_das],
                                       self.batch_size, None)]
    self.train_dec = [cut_batch_into_steps(b)
                      for b in grouper([self.tree_embs.get_embeddings(tree)
                                        for tree in self.train_trees],
                                       self.batch_size, None)]

    # train the classifier for filtering n-best lists
    if self.classif_filter:
        self.classif_filter.train(self.train_das, self.train_trees,
                                  valid_das=self.valid_das,
                                  valid_trees=self.valid_trees)
        self.classif_filter.restore_checkpoint()  # restore the best performance on devel data

    # convert validation data to flat trees to enable F1 measuring
    if self.validation_size > 0 and self.use_tokens:
        self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees)

    # initialize top costs
    self.top_k_costs = [float('nan')] * self.top_k
    self.checkpoint_path = None

    # build the NN
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.initialize_all_variables())
from flect.config import Config
from tgen.features import Features
from tgen.futil import trees_from_doc, read_ttrees, read_das

import sys
import timeit
import datetime

if len(sys.argv[1:]) != 3:
    sys.exit('Usage: ./bench_feats.py features_cfg.py trees.yaml.gz das.txt')

print >> sys.stderr, 'Loading...'

cfg = Config(sys.argv[1])
trees = trees_from_doc(read_ttrees(sys.argv[2]), 'en', '')
das = read_das(sys.argv[3])

feats = Features(cfg['features'])


def test_func():
    for tree, da in zip(trees, das):
        feats.get_features(tree, {'da': da})


print >> sys.stderr, 'Running test...'
secs = timeit.timeit('test_func()', setup='from __main__ import test_func', number=10)
td = datetime.timedelta(seconds=secs)
print >> sys.stderr, 'Time taken: %s' % str(td)
def asearch_gen(args):
    """A*search generation"""
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open & close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das, trees_from_doc(eval_doc, tgen.language,
                                                                      eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n"
                          + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree)
                          + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f'
                 % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees,
                                                          gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f"
                            % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s"
                 % evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language, selector):
        """Decide to write t-trees or tokens based on the output file name."""
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, base_doc, language, selector),
                         output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-D', '--delex-output-file', type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        write_trees_or_tokens(args.delex_output_file, das, gen_trees, None,
                              tgen.language, args.target_selector or tgen.selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc,
                              tgen.language, args.target_selector or tgen.selector)
def _init_training(self, das, trees, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]
    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())