def _prepare_train_toks(self, train_trees, train_abstr_fname, valid_trees=None, valid_abstr_fname=None):
    """Prepare training data for the form selection LM.

    Use training trees/tagged lemmas/tokens, apply lexicalization instructions
    (including surface forms), and convert the output to lists of tokens (sentences).

    @param train_trees: main generator training data (trees, tagged lemmas, or tokens)
    @param train_abstr_fname: file name of the corresponding lexicalization instructions
    @param valid_trees: optional validation data, handled the same way as training data
    @param valid_abstr_fname: optional lexicalization instructions for the validation data
    @return: a (training, validation) pair of lists of lists of LM tokens (lexicalized)
    """
    # load abstraction file(s)
    abstss = read_absts(train_abstr_fname)
    if valid_abstr_fname is not None:
        abstss.extend(read_absts(valid_abstr_fname))
    # concatenate training + validation data (they are handled in the same way);
    # remember the training size *before* extending -- train_trees may be a
    # generator, so len() must be taken on the materialized list
    trees = list(train_trees)
    num_train_trees = len(trees)
    if valid_trees is not None:
        trees.extend(valid_trees)
    out_sents = []
    num_train_sents = 0  # number of output sentences produced from training trees
    for tree_no, (tree, absts) in enumerate(zip(trees, abstss)):
        # validation data may have more paraphrases -> treat them separately
        # (list of lists, or list of TreeData's for self.mode == 'tree')
        if isinstance(tree[-1], (list, TreeData)):
            for tree_ in tree:
                out_sents.append(self._tree_to_sentence(tree_, absts))
        # default: one paraphrase
        else:
            out_sents.append(self._tree_to_sentence(tree, absts))
        # track the split point by sentences actually produced, not by tree count,
        # so the split stays correct even if a training tree holds paraphrase lists
        if tree_no < num_train_trees:
            num_train_sents = len(out_sents)
    # split training/validation data
    return out_sents[:num_train_sents], out_sents[num_train_sents:]
def lexicalize(self, gen_trees, abst_file):
    """Lexicalize nodes in the generated trees (which may represent trees, tokens,
    or tagged lemmas, depending on self.mode).

    Expects the lexicalization file (and surface forms file) to be loaded in the
    Lexicalizer object, otherwise nothing will happen.

    @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
    @param abst_file: abstraction/delexicalization instructions file path
    @return: None (gen_trees are modified in place)
    """
    abstss = read_absts(abst_file)
    for tree, absts in zip(gen_trees, abstss):
        sent = self._tree_to_sentence(tree)
        for idx, tok in enumerate(sent):
            if tok and tok.startswith('X-'):  # we would like to delexicalize
                slot = tok[2:]
                # check if we have a value to substitute; if yes, do it
                abst = self._first_abst(absts, slot)
                if abst:
                    # tagged lemmas: one token with appropriate value
                    if self.mode == 'tagged_lemmas':
                        tag = sent[idx + 1] if idx < len(sent) - 1 else None
                        val = self.get_surface_form(sent, idx, slot, abst.value, tag=tag)
                        tree.nodes[idx + 1] = NodeData(t_lemma=val, formeme='x')
                    # trees: one node with appropriate value, keep formeme
                    # NB: must use floor division -- plain "/" yields a float
                    # index under Python 3 and raises TypeError
                    elif self.mode == 'trees':
                        formeme = sent[idx + 1] if idx < len(sent) - 1 else None
                        val = self.get_surface_form(sent, idx, slot, abst.value, formeme=formeme)
                        tree.nodes[idx // 2 + 1] = NodeData(t_lemma=val,
                                                            formeme=tree[idx // 2 + 1].formeme)
                    # tokens: one token with all words from the value (postprocessed below)
                    else:
                        val = self.get_surface_form(sent, idx, slot, abst.value)
                        tree.nodes[idx + 1] = NodeData(t_lemma=val, formeme='x')
                    sent[idx] = val  # save value to be used in LM next time
        # postprocess tokens (split multi-word nodes)
        if self.mode == 'tokens':
            idx = 1
            while idx < len(tree):
                if ' ' in tree[idx].t_lemma:
                    value = tree[idx].t_lemma
                    tree.remove_node(idx)
                    for shift, tok in enumerate(value.split(' ')):
                        tree.create_child(0, idx + shift, NodeData(t_lemma=tok, formeme='x'))
                    idx += shift  # skip over the newly created nodes
                idx += 1
def _prepare_train_toks(self, train_trees, train_abstr_fname):
    """Prepare training data for the form selection LM.

    Use training trees/tagged lemmas/tokens, apply lexicalization instructions
    (including surface forms), and convert the output to a list of lists of
    tokens (sentences).

    @param train_trees: main generator training data (trees, tagged lemmas, or tokens)
    @param train_abstr_fname: file name of the corresponding lexicalization instructions
    @return: list of lists of LM training tokens (lexicalized)
    """
    # load abstraction file
    abstss = read_absts(train_abstr_fname)
    out_sents = []
    for tree, absts in zip(train_trees, abstss):
        # create a list of tokens
        out_sent = self._tree_to_sentence(tree)
        # lexicalize the resulting sentence using abstraction instructions
        for idx, tok in enumerate(out_sent):
            if tok and tok.startswith('X-'):
                abst = self._first_abst(absts, tok[2:])
                # guard against slots with no matching instruction -- keep the
                # placeholder instead of crashing on abst.surface_form
                if abst is not None:
                    # abstract away numbers in the surface form
                    out_sent[idx] = re.sub(r'\b[0-9]+\b', '_', abst.surface_form)
        # store the result
        out_sents.append(out_sent)
    return out_sents