Example #1
 def _prepare_train_toks(self,
                         train_trees,
                         train_abstr_fname,
                         valid_trees=None,
                         valid_abstr_fname=None):
     """Prepare training data for form selection LM. Use training trees/tagged lemmas/tokens,
     apply lexicalization instructions including surface forms, and convert the output to a
     list of lists of tokens (sentences).
     @param train_trees: main generator training data (trees, tagged lemmas, tokens)
     @param train_abstr_fname: file name for the corresponding lexicalization instructions
     @return: list of lists of LM training tokens (lexicalized)
     """
     # load abstraction file
     abstss = read_absts(train_abstr_fname)
     if valid_abstr_fname is not None:
         abstss.extend(read_absts(valid_abstr_fname))
     # concatenate training + validation data (will be handled in the same way)
     trees = list(train_trees)
     if valid_trees is not None:
         trees.extend(valid_trees)
     out_sents = []
     for tree, absts in zip(trees, abstss):
         # validation data may have more paraphrases -> treat them separately
         # (list of lists, or list of TreeData's for self.mode == 'trees')
         if isinstance(tree[-1], (list, TreeData)):
             for tree_ in tree:
                 out_sents.append(self._tree_to_sentence(tree_, absts))
         # default: one paraphrase
         else:
             out_sents.append(self._tree_to_sentence(tree, absts))
     # split the combined output back into training and validation parts
     # (training trees are assumed to yield exactly one sentence each)
     return out_sents[:len(train_trees)], out_sents[len(train_trees):]
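
A minimal, runnable sketch of the concatenate-then-split logic above, where plain strings stand in for trees and token lists (the variable names mirror the method; the data itself is made up for demonstration):

    # Toy illustration: training items yield one sentence each, validation items
    # may hold several paraphrases, so cutting at len(train_trees) recovers the
    # training part exactly.
    train_trees = ['t1', 't2']
    valid_trees = [['v1a', 'v1b'], ['v2a']]  # paraphrase lists, as in the method
    out_sents = []
    for tree in train_trees + valid_trees:
        if isinstance(tree, list):           # several paraphrases -> several sentences
            out_sents.extend(tree)
        else:
            out_sents.append(tree)
    print(out_sents[:len(train_trees)])      # ['t1', 't2']
    print(out_sents[len(train_trees):])      # ['v1a', 'v1b', 'v2a']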
Example #2
    def lexicalize(self, gen_trees, abst_file):
        """Lexicalize nodes in the generated trees (which may represent trees, tokens, or tagged lemmas).
        Expects lexicalization file (and surface forms file) to be loaded in the Lexicalizer object,
        otherwise nothing will happen. The actual operation depends on the generator mode
        (self.mode, one of "trees"/"tokens"/"tagged_lemmas").

        @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
        @param abst_file: abstraction/delexicalization instructions file path
        @return: None
        """
        abstss = read_absts(abst_file)
        for tree, absts in zip(gen_trees, abstss):
            sent = self._tree_to_sentence(tree)
            for idx, tok in enumerate(sent):
                if tok and tok.startswith('X-'):  # a delexicalized slot we would like to lexicalize
                    slot = tok[2:]
                    # check if we have a value to substitute; if yes, do it
                    abst = self._first_abst(absts, slot)
                    if abst:
                        # tagged lemmas: one token with appropriate value
                        if self.mode == 'tagged_lemmas':
                            tag = sent[idx+1] if idx < len(sent) - 1 else None
                            val = self.get_surface_form(sent, idx, slot, abst.value, tag=tag)
                            tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')
                        # trees: one node with appropriate value, keep formeme
                        elif self.mode == 'trees':
                            formeme = sent[idx+1] if idx < len(sent) - 1 else None
                            val = self.get_surface_form(sent, idx, slot, abst.value,
                                                        formeme=formeme)
                            # the sentence alternates lemmas and formemes, hence
                            # node index idx // 2 + 1 (integer division)
                            tree.nodes[idx // 2 + 1] = NodeData(
                                t_lemma=val, formeme=tree[idx // 2 + 1].formeme)
                        # tokens: one token with all words from the value (postprocessed below)
                        else:
                            val = self.get_surface_form(sent, idx, slot, abst.value)
                            tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')
                        sent[idx] = val  # save value to be used in LM next time
            # postprocess tokens (split multi-word nodes)
            if self.mode == 'tokens':
                idx = 1
                while idx < len(tree):
                    if ' ' in tree[idx].t_lemma:
                        value = tree[idx].t_lemma
                        tree.remove_node(idx)
                        for shift, tok in enumerate(value.split(' ')):
                            tree.create_child(0, idx + shift,
                                              NodeData(t_lemma=tok, formeme='x'))
                        idx += shift  # skip over the newly inserted word nodes
                    idx += 1
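
The token-mode postprocessing (splitting multi-word values into one node per word) can be checked in isolation; a self-contained sketch where a plain list stands in for TreeData and its remove_node/create_child calls (an assumption for demonstration only):

    # Splitting multi-word values, mirroring the while loop above on a plain list.
    nodes = ['<root>', 'serves', 'fast food', 'cheaply']
    idx = 1
    while idx < len(nodes):
        if ' ' in nodes[idx]:
            words = nodes.pop(idx).split(' ')     # remove the multi-word node
            for shift, tok in enumerate(words):
                nodes.insert(idx + shift, tok)    # re-insert one node per word
            idx += shift                          # skip over the inserted words
        idx += 1
    print(nodes)  # ['<root>', 'serves', 'fast', 'food', 'cheaply']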
Example #3
 def _prepare_train_toks(self, train_trees, train_abstr_fname):
     """Prepare training data for form selection LM. Use training trees/tagged lemmas/tokens,
     apply lexicalization instructions including surface forms, and convert the output to a
     list of lists of tokens (sentences).
     @param train_trees: main generator training data (trees, tagged lemmas, tokens)
     @param train_abstr_fname: file name for the corresponding lexicalization instructions
     @return: list of lists of LM training tokens (lexicalized)
     """
     # load abstraction file
     abstss = read_absts(train_abstr_fname)
     out_sents = []
     for tree, absts in zip(train_trees, abstss):
         # create a list of tokens
         out_sent = self._tree_to_sentence(tree)
         # lexicalize the resulting sentence using abstraction instructions
         for idx, tok in enumerate(out_sent):
             if tok.startswith('X-'):
                 abst = self._first_abst(absts, tok[2:])
                 if abst:  # substitute only if we have a value (same check as in lexicalize)
                     # abstract away standalone numbers so the LM generalizes over them
                     form = re.sub(r'\b[0-9]+\b', '_', abst.surface_form)
                     out_sent[idx] = form
         # store the result
         out_sents.append(out_sent)
     return out_sents
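
The number-abstraction step can be verified on its own; a quick, self-contained demonstration of the regex (the example surface forms are made up):

    import re

    # Standalone digit runs become '_'; digits glued to letters are kept,
    # since \b requires a word boundary on both sides of the number.
    for form in ['20 pounds', 'route 66a', 'table for 2']:
        print(re.sub(r'\b[0-9]+\b', '_', form))
    # -> '_ pounds', 'route 66a', 'table for _'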