def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    All tokens except the ``<GO>``, ``<STOP>`` and ``<VOID>`` symbols are
    attached, in order, as children of node 0.

    @param emb: source embeddings (token IDs)
    @param postprocess: postprocess the sentence (capitalize sentence start, merge plural
        markers)? True by default.
    @return: the corresponding tree
    """
    tree = TreeData()
    tokens = self.ids_to_strings(emb)
    for token in tokens:
        if token in ['<GO>', '<STOP>', '<VOID>']:
            continue
        if postprocess:
            # casing (only if set to lowercase): capitalize the first token
            # (len(tree) == 1 presumably means only the technical root exists -- TODO confirm)
            # and any token following sentence-final punctuation.
            # The parentheses matter: `and` binds tighter than `or`, so without them
            # tokens after '.', '?', '!' were capitalized even with lowercasing off.
            if self.lowercase and (len(tree) == 1 or
                                   tree.nodes[-1].t_lemma in ['.', '?', '!']):
                # slicing instead of indexing keeps an empty token from raising IndexError
                token = token[:1].upper() + token[1:]
            # plural merging (if plural tokens come up)
            if token == '<-s>':
                if tree.nodes[-1].t_lemma is None:
                    # nothing to pluralize -- drop the marker
                    continue
                # replace the previous node by its plural form
                token = self._singular_to_plural(tree.nodes[-1].t_lemma)
                tree.remove_node(len(tree) - 1)
        tree.create_child(0, len(tree), NodeData(token, 'x'))
    return tree
def ids_to_tree(self, emb):
    """Rebuild a tree from the embeddings (token IDs).

    @param emb: source embeddings (token IDs)
    @return: the corresponding tree
    """
    tree = TreeData()
    # override the technical root -- the recursive build below creates
    # the whole tree, technical root included
    tree.nodes, tree.parents = [], []
    # start at position 2 to skip the <GO> symbol and the 1st opening bracket
    self._create_subtree(tree, -1, emb, 2)
    return tree
def _beam_search(self, enc_inputs, da):
    """Run beam search decoding.

    @param enc_inputs: encoder inputs; only a single instance is supported (asserted)
    @param da: the input DA, passed on to path reranking and slot error statistics
    @return: decoder inputs (token IDs) of the selected path, as a numpy array
    """
    # true "batches" not implemented
    assert len(enc_inputs[0]) == 1

    def path_tokens(path):
        """Token strings for the decoder inputs of a path."""
        return self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs])

    def norm_logprob(path):
        """Length-weighted logprob of a path (the sorting criterion)."""
        return path.logprob / (len(path) ** self.length_norm_weight)

    # run greedy decoder for comparison (debugging purposes)
    greedy_ids = self._greedy_decoding(enc_inputs, None)[0]
    log_debug("GREEDY DEC WOULD RETURN:\n" +
              " ".join(self.tree_embs.ids_to_strings([out_tok[0] for out_tok in greedy_ids])))

    # initialize: a single path starting from the first decoder step of an empty tree
    self._init_beam_search(enc_inputs)
    empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
    dec_inputs = cut_batch_into_steps([empty_tree_emb])
    paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP,
                               dec_inputs=[dec_inputs[0]])]

    # beam search steps
    for step in xrange(len(dec_inputs)):
        expanded = []
        for path in paths:
            out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
            expanded.extend(path.expand(self.beam_size, out_probs, st))

        # keep the `beam_size` best paths, best first
        paths = sorted(expanded, key=norm_logprob, reverse=True)[:self.beam_size]

        # stop decoding if we have reached the end in all paths
        if all(p.dec_inputs[-1] == self.tree_embs.VOID for p in paths):
            break

        report = [("%f\t" % p.logprob) + " ".join(path_tokens(p)) for p in paths]
        log_debug(("\nBEAM SEARCH STEP %d\n" % step) + "\n".join(report) + "\n")

    # rerank paths by their distance to the input DA
    if self.classif_filter or self.context_bleu_weight:
        paths = self._rerank_paths(paths, da)

    # measure slot error on the top k paths
    if self.slot_err_stats:
        for path in paths[:self.sample_top_k]:
            self.slot_err_stats.append(da, path_tokens(path))

    # select the "best" path -- either the best, or one in top k
    if self.sample_top_k > 1:
        best_path = self._sample_path(paths[:self.sample_top_k])
    else:
        best_path = paths[0]

    # return just the best path (as token IDs)
    return np.array(best_path.dec_inputs)
def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training.

    Store input data, initialize 1-hot feature representations for input
    and output and transform training data accordingly, initialize the
    classification neural network.

    @param das_file: name of the file with training DAs
    @param ttree_file: name of the file with the corresponding t-trees
    @param data_portion: portion of the training data to be used
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    trees = trees_from_doc(read_ttrees(ttree_file), self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    self.train_trees.append(TreeData())
    self.train_das.append(DA.parse('inform()'))
    train_size += 1

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree)
                           for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        tree_feat_dicts = [self.tree_feats.get_features(tree, {})
                           for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(tree_feat_dicts)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    da_feat_dicts = [self.da_feats.get_features(None, {'da': da})
                     for da in self.train_das]
    self.y = self.da_vect.fit_transform(da_feat_dicts)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
def can_generate_greedy(self, tree, da):
    """Check if the candidate generator can generate a given tree greedily,
    always pursuing the first viable path. This is for debugging purposes only.

    Uses `get_all_successors` and always goes on with the first successor
    that is still a subtree of the target tree.

    @param tree: the target tree
    @param da: the DA the generator is initialized with
    @return: True if the target tree was reached, False on a dead end
    """
    self.init_run(da)
    cur_subtree = TreeData()

    while cur_subtree != tree:
        for succ in self.get_all_successors(cur_subtree):
            # use the first successor that is still a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                cur_subtree = succ
                break
        else:
            # no viable successor -- we have hit a dead end
            log_info('Did not find tree: ' + str(tree) + ' for DA: ' + str(da))
            return False

    # everything alright
    log_info('Found tree: %s for DA: %s' % (str(tree), str(da)))
    return True
def _get_greedy_decoder_output(self, enc_inputs, dec_inputs, compute_cost=False):
    """Run greedy decoding with the given inputs; return decoder outputs and the cost
    (if required). For ensemble decoding, the greedy search is implemented as a beam
    search with a beam size of 1.

    @param enc_inputs: encoder inputs (list of token IDs)
    @param dec_inputs: decoder inputs (list of token IDs); ignored here -- decoding \
        always starts from the embedding of an empty tree
    @param compute_cost: if True, decoding cost is computed (the dec_inputs must be \
        valid trees); not supported here (asserted to be False)
    @return a tuple of list of decoder outputs + decoding cost (always None here)
    """
    # TODO batches and cost computation not implemented
    assert len(enc_inputs[0]) == 1 and not compute_cost

    self._init_beam_search(enc_inputs)

    # for simplicity, this is implemented exactly like a beam search,
    # but with a single path
    steps = cut_batch_into_steps([self.tree_embs.get_embeddings(TreeData())])
    path = self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[steps[0]])

    for _ in xrange(len(steps)):
        out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
        # keep only the single best expansion
        path = path.expand(1, out_probs, st)[0]
        if path.dec_inputs[-1] == self.tree_embs.VOID:
            break  # stop decoding if we have reached the end of path

    # return just token IDs, ignore cost computation here
    return np.array(path.dec_inputs), None
def can_generate(self, tree, da):
    """Check if the candidate generator can generate a given tree at all.

    This is for debugging purposes only. Tries if get_all_successors always
    returns a successor that leads to the given tree (puts on the open list
    only successors that are subtrees of the given tree).

    @param tree: the target tree
    @param da: the DA the generator is initialized with
    @return: False if the tree was not found; otherwise the number of successors \
        examined before it was found (note this is 0, i.e. falsy, if the target \
        equals the initial empty tree)
    """
    self.init_run(da)
    open_list = CandidateList({TreeData(): 1})
    tree_no = 0  # number of successor trees examined so far

    while open_list:
        cur_st, _ = open_list.pop()
        if cur_st == tree:
            log_info('Found tree: %s for DA: %s (as %d-th tree)' %
                     (str(tree), str(da), tree_no))
            return tree_no
        for succ in self.get_all_successors(cur_st):
            tree_no += 1
            # only push on the open list if the successor is still
            # a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                open_list.push(succ, len(succ))

    # open list exhausted without reaching the target
    log_info('Did not find tree: ' + str(tree) + ' for DA: ' + str(da) +
             ('(total %d trees)' % tree_no))
    return False
def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    @param emb: source embeddings (token IDs)
    @param postprocess: accepted as part of the signature, but not used by this \
        implementation (no capitalization or plural merging is done here)
    @return: the corresponding tree
    """
    technical_symbols = ('<GO>', '<STOP>', '<VOID>')
    tree = TreeData()
    for token in self.ids_to_strings(emb):
        if token not in technical_symbols:
            # attach each regular token to node 0, at the end of the tree
            tree.create_child(0, len(tree), NodeData(token, 'x'))
    return tree
def _greedy_decoding(self, enc_inputs, gold_trees):
    """Run greedy decoding with the given encoder inputs; optionally use given gold
    trees as decoder inputs for cost computation.

    @param enc_inputs: encoder inputs
    @param gold_trees: gold trees to be used as decoder inputs, or None
    @return: a tuple of decoder output IDs and decoding cost, as returned by \
        `_get_greedy_decoder_output`
    """
    have_gold = gold_trees is not None

    # prepare decoder inputs (either true but used just for cost computation, or fake)
    if have_gold:
        dec_embs = [self.tree_embs.get_embeddings(tree) for tree in gold_trees]
    else:
        # one (shared) empty-tree embedding per encoder input instance
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_embs = [empty_tree_emb] * len(enc_inputs[0])
    dec_inputs = cut_batch_into_steps(dec_embs)

    # run the decoding per se
    output_ids, cost = self._get_greedy_decoder_output(
        enc_inputs, dec_inputs, compute_cost=have_gold)
    return output_ids, cost
def asearch_gen(args):
    """A*search generation.

    Options (parsed with getopt):
      -e FILE      t-trees file to evaluate the generated trees against
      -s SELECTOR  selector used to read trees from the evaluation file
      -d FILE      file the debug stream is written to
      -w FILE      file the generated t-trees are written to
      -c FILE      configuration file for the planner
    followed by exactly three positional arguments: candidate generator model
    file, ranker model file and the file with input DAs.

    @param args: list of command-line arguments (options + the three file names)
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into a fresh document if the selectors are the same,
        # otherwise into the evaluation document itself
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) +
                          "\n")

        # all three figures use 4 decimal places (the last one used to be '%4f',
        # i.e. a minimum width of 4 with the default precision)
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" %
                            p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
def _init_training(self, das, trees, data_portion):
    """Initialize training.

    Store input data, initialize 1-hot feature representations for input
    and output and transform training data accordingly, initialize the
    classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        # trees given directly as token lists -- convert them to flat trees
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    # (guarded so that an empty training portion does not raise IndexError)
    if self.train_das and isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]

    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    # an explicit list, so the order stays a mutable sequence on any Python version
    self.train_order = list(range(len(self.train_trees)))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)
    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())
def to_treedata(t):
    """Convert the argument to TreeData.

    A TreeNode yields its `tree` attribute, a T object is converted using
    `TreeData.from_ttree`; anything else results in None.
    """
    if isinstance(t, TreeNode):
        return t.tree
    if isinstance(t, T):
        return TreeData.from_ttree(t)
    # neither of the supported types
    return None
from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData
import random
import zlib

# Fill a CandidateList with randomly built trees (fixed seed), pop everything
# and print a CRC32 checksum of the popped sequence.
random.seed(1206)

candidates = CandidateList()
for _ in xrange(10000):
    # alternative key types tried previously:
    # l[str(i)] = random.randint(0, 100)
    # l[str(random.randint(0,1000))] = random.randint(0, 100)
    # l[(str(random.randint(0,1000)), str(random.randint(0,1000)))] = random.randint(0, 100)
    # tree = TreeData()
    # tree.create_child(0, 1, NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    # l[tree] = random.randint(0, 100)
    tree = TreeData()
    for _ in xrange(random.randint(1, 10)):
        # NB: the random draws must stay in exactly this order
        parent_idx = random.randint(0, len(tree) - 1)
        flag = random.randint(0, 1) == 1
        label_a = str(random.randint(0, 1000))
        label_b = str(random.randint(0, 1000))
        tree.create_child(parent_idx, flag, NodeData(label_a, label_b))
    candidates[tree] = random.randint(0, 100)

# pop all items in the order given by the list
popped = []
while candidates:
    popped.append(candidates.pop())

print(zlib.crc32(str(popped)))
def asearch_gen(args):
    """A*search generation.

    Options (parsed with getopt):
      -e FILE      t-trees file to evaluate the generated trees against
      -s SELECTOR  selector used to read trees from the evaluation file
      -d FILE      file the debug stream is written to
      -w FILE      file the generated t-trees are written to
      -c FILE      configuration file for the planner
    followed by exactly three positional arguments: candidate generator model
    file, ranker model file and the file with input DAs.

    @param args: list of command-line arguments (options + the three file names)
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into a fresh document if the selectors are the same,
        # otherwise into the evaluation document itself
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) +
                          "\n")

        # all three figures use 4 decimal places (the last one used to be '%4f',
        # i.e. a minimum width of 4 with the default precision)
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" %
                            p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData
import random
import zlib

# Fill a CandidateList with randomly built trees (fixed seed), pop everything
# and print a CRC32 checksum of the popped sequence.
random.seed(1206)

candidates = CandidateList()
for _ in xrange(10000):
    # alternative key types tried previously:
    # l[str(i)] = random.randint(0, 100)
    # l[str(random.randint(0,1000))] = random.randint(0, 100)
    # l[(str(random.randint(0,1000)), str(random.randint(0,1000)))] = random.randint(0, 100)
    # tree = TreeData()
    # tree.create_child(0, 1, NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    # l[tree] = random.randint(0, 100)
    tree = TreeData()
    for _ in xrange(random.randint(1, 10)):
        # NB: the random draws must stay in exactly this order
        parent_idx = random.randint(0, len(tree) - 1)
        flag = random.randint(0, 1) == 1
        label_a = str(random.randint(0, 1000))
        label_b = str(random.randint(0, 1000))
        tree.create_child(parent_idx, flag, NodeData(label_a, label_b))
    candidates[tree] = random.randint(0, 100)

# pop all items in the order given by the list
popped = []
while candidates:
    popped.append(candidates.pop())

print(zlib.crc32(str(popped)))