Esempio n. 1
1
    def _init_training(self, das_file, ttree_file, data_portion):
        """Set up everything that is needed before the actual training.

        Loads the training DAs and t-trees, keeps the requested portion of them (plus one
        artificial empty instance), converts both sides into their vectorized representations
        and finally builds the classification neural network.

        @param das_file: path to the training DAs file
        @param ttree_file: path to the training t-trees file
        @param data_portion: fraction of the training data to be actually used
        """
        # load both sides of the training data
        log_info('Reading DAs from ' + das_file + '...')
        all_das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        all_trees = trees_from_doc(read_ttrees(ttree_file), self.language, self.selector)

        # cut the data down to the requested portion
        num_insts = int(round(data_portion * len(all_trees)))
        self.train_trees = all_trees[:num_insts]
        self.train_das = all_das[:num_insts]

        # one extra instance pairing an empty tree with an empty DA
        # (so that the network cannot keep any of its outputs "always-on")
        self.train_trees.append(TreeData())
        blank_da = DialogueAct()
        blank_da.parse('inform()')
        self.train_das.append(blank_da)
        num_insts += 1

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % num_insts)

        # input side: tree embeddings if configured, vectorized node features otherwise
        if not self.tree_embs:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            tree_feat_dicts = [self.tree_feats.get_features(tree, {})
                               for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(tree_feat_dicts)
        else:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            tree_emb_list = [self.tree_embs.get_embeddings(tree)
                             for tree in self.train_trees]
            self.X = np.array(tree_emb_list)

        # output side: vectorized DA features ('dat' and 'svp' presence)
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        da_feat_dicts = [self.da_feats.get_features(None, {'da': da})
                         for da in self.train_das]
        self.y = self.da_vect.fit_transform(da_feat_dicts)

        # network I/O dimensions
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # build the classifier network itself
        self._init_neural_network()
Esempio n. 2
0
    def evaluate_file(self, das_file, ttree_file):
        """Run the reranking classifier over a pair of DA/tree files and sum up
        the number of DAIs and the distances of the trees to their DAs.

        @param das_file: DA file path
        @param ttree_file: trees/sentences file path
        @return: a tuple (total DAIs, distance)
        """
        das = read_das(das_file)
        ttree_doc = read_ttrees(ttree_file)

        # get the trees in the representation that matches the current mode
        if self.mode == 'tokens':
            trees = self._tokens_to_flat_trees(
                tokens_from_doc(ttree_doc, self.language, self.selector))
        elif self.mode == 'tagged_lemmas':
            trees = self._tokens_to_flat_trees(
                tagged_lemmas_from_doc(ttree_doc, self.language, self.selector))
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # accumulate totals over all DA-tree pairs
        total_dais = 0
        total_dist = 0
        for cur_da, cur_tree in zip(das, trees):
            total_dais += len(cur_da)
            total_dist += self.dist_to_da(cur_da, [cur_tree])[0]

        return total_dais, total_dist
Esempio n. 3
0
    def evaluate_file(self, das_file, ttree_file):
        """Run the reranking classifier over a pair of DA/tree files, summing up
        the number of DAIs and the distances of the trees to their DAs.

        @param das_file: DA file path
        @param ttree_file: trees/sentences file path
        @return: a triple (total DAIs, total distance, list of DAs built by \
            `DA.parse_features` from the classification of each tree)
        """
        log_info('Reading DAs from ' + das_file + '...')
        gold_das = read_das(das_file)
        log_info('Reading t-trees/tokens from ' + ttree_file + '...')
        trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)

        # token-based modes are evaluated on flat trees
        if self.mode in ('tokens', 'tagged_lemmas'):
            with_tags = self.mode == 'tagged_lemmas'
            trees = self._tokens_to_flat_trees(trees, use_tags=with_tags)

        dai_count = 0
        dist_sum = 0
        classif_das = []
        for gold_da, cur_tree in zip(gold_das, trees):
            dai_count += len(gold_da)
            dists, classifs = self.dist_to_da(gold_da, [cur_tree], return_classif=True)
            # only one tree was passed in, so only the first result is relevant
            dist_sum += dists[0]
            classif_das.append(DA.parse_features(classifs[0]))

        return dai_count, dist_sum, classif_das
def get_training_das_texts():
    """Return the training DAs along with the corresponding texts.

    If DATASET_WEBNLG is set, the result of `get_das_texts_from_webnlg` for the WebNLG
    training file is returned as-is. Otherwise, the DAs are read from the E2E training
    DAs file and each training text is wrapped in START_TOK / END_TOK.
    """
    if not DATASET_WEBNLG:
        das = read_das("tgen/e2e-challenge/input/train-das.txt")
        texts = [[START_TOK] + toks + [END_TOK] for toks in get_texts_training()]
        return das, texts
    return get_das_texts_from_webnlg('WebNLG_Reader/data/webnlg/train.json')
Esempio n. 5
0
    def _init_training(self, das_file, ttree_file, data_portion):
        """Prepare the classifier for training.

        Reads the input DAs and t-trees, trims them to the requested portion (adding one
        artificial empty instance), turns inputs and outputs into their vectorized
        representations, and builds the classification neural network.

        @param das_file: path to the training DAs file
        @param ttree_file: path to the training t-trees file
        @param data_portion: fraction of the training data to be actually used
        """
        # read the raw training data
        log_info('Reading DAs from ' + das_file + '...')
        loaded_das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        loaded_trees = trees_from_doc(read_ttrees(ttree_file), self.language, self.selector)

        # use just the requested part of the data
        size = int(round(data_portion * len(loaded_trees)))
        self.train_trees = loaded_trees[:size]
        self.train_das = loaded_das[:size]

        # extra instance: empty tree + empty DA
        # (this forbids the network to keep any of its outputs "always-on")
        self.train_trees.append(TreeData())
        self.train_das.append(DA.parse('inform()'))
        size += 1

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % size)

        # inputs: embeddings if available, vectorized node features otherwise
        if not self.tree_embs:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            node_feats = [self.tree_feats.get_features(tree, {})
                          for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(node_feats)
        else:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            embs = [self.tree_embs.get_embeddings(tree) for tree in self.train_trees]
            self.X = np.array(embs)

        # outputs: vectorized DA features ('dat' and 'svp' presence)
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        da_feats = [self.da_feats.get_features(None, {'da': da})
                    for da in self.train_das]
        self.y = self.da_vect.fit_transform(da_feats)

        # shapes of the network input and output
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # finally, create the NN classifier
        self._init_neural_network()
Esempio n. 6
0
    def evaluate_file(self, das_file, ttree_file):
        """Evaluate the reranking classifier on the given DA and tree files, returning
        the total number of DAIs and the summed distance of the trees to their DAs.

        @param das_file: DA file path
        @param ttree_file: trees/sentences file path
        @return: a tuple (total DAIs, distance)
        """
        das = read_das(das_file)
        doc = read_ttrees(ttree_file)

        # tokens and tagged lemmas are both converted to flat trees; anything else
        # is read as regular trees
        if self.mode == 'tokens':
            trees = self._tokens_to_flat_trees(
                tokens_from_doc(doc, self.language, self.selector))
        elif self.mode == 'tagged_lemmas':
            trees = self._tokens_to_flat_trees(
                tagged_lemmas_from_doc(doc, self.language, self.selector))
        else:
            trees = trees_from_doc(doc, self.language, self.selector)

        num_dais = 0
        sum_dist = 0
        for da, tree in zip(das, trees):
            num_dais += len(da)
            # a single tree is passed in, so take the first (only) distance
            sum_dist += self.dist_to_da(da, [tree])[0]

        return num_dais, sum_dist
Esempio n. 7
0
def sample_gen(args):
    """Generate trees by sampling; optionally compute oracle scores and save the output.

    Expected arguments (getopt style): [-n NUM] [-o ORACLE_EVAL_FILE] [-w OUTPUT_FILE]
    <candidate generator model> <DA test file>. A `-r` option with a value is accepted
    as well, but it is not used. Exits with the module docstring unless exactly two
    positional arguments are given.

    @param args: list of command-line arguments (without the program name)
    """
    from pytreex.core.document import Document

    # defaults, possibly overridden by the options
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None

    opts, files = getopt(args, 'r:n:o:w:')
    for flag, value in opts:
        if flag == '-n':
            num_to_generate = int(value)
        elif flag == '-o':
            oracle_eval_file = value
        elif flag == '-w':
            fname_ttrees_out = value

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # load model (the candidate generator also serves as the ranker)
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    tgen = SamplingPlanner({'candgen': candgen, 'ranker': candgen})

    # generate: each DA gets `num_to_generate` trees in a row
    log_info('Generating...')
    gen_doc = Document()
    for da in read_das(fname_da_test):
        for _ in xrange(num_to_generate):
            tgen.generate_tree(da, gen_doc)

    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')

        correct = predicted = gold = 0
        for gold_tree, cand_trees in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # pick the candidate with the best F1 and add up its counts
            counts = [corr_pred_gold(gold_tree, cand_tree) for cand_tree in cand_trees]
            scored = [(f1_from_counts(c, p, g), c, p, g) for c, p, g in counts]
            _, best_c, best_p, best_g = max(scored, key=lambda item: item[0])
            correct += best_c
            predicted += best_p
            gold += best_g

        # overall oracle scores
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold))

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 8
0
def sample_gen(args):
    """Sample trees for the given DAs, with optional oracle evaluation and output saving.

    Options (getopt style): `-n NUM` sets the number of trees generated per DA (default 1),
    `-o FILE` turns on the oracle evaluation against gold trees in FILE, `-w FILE` writes
    the generated trees to FILE; `-r` takes a value but is ignored. Two positional
    arguments are required (candidate generator model, DA test file), otherwise the
    function exits with the module docstring.

    @param args: list of command-line arguments (without the program name)
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'r:n:o:w:')

    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None
    for name, val in opts:
        if name == '-n':
            num_to_generate = int(val)
        elif name == '-o':
            oracle_eval_file = val
        elif name == '-w':
            fname_ttrees_out = val

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # the same loaded model acts as both the candidate generator and the ranker
    log_info('Initializing...')
    model = RandomCandidateGenerator.load_from_file(fname_cand_model)
    tgen = SamplingPlanner({'candgen': model, 'ranker': model})

    # repeat generation n times for every input DA
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):
            tgen.generate_tree(da, gen_doc)

    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')

        correct, predicted, gold = 0, 0, 0
        generated_groups = chunk_list(gen_trees, num_to_generate)
        for gold_tree, group in zip(gold_trees, generated_groups):
            # counts for all trees generated for this instance, then the best one by F1
            group_counts = [corr_pred_gold(gold_tree, tree) for tree in group]
            with_f1 = [(f1_from_counts(c, p, g), c, p, g) for c, p, g in group_counts]
            top = max(with_f1, key=lambda entry: entry[0])
            correct += top[1]
            predicted += top[2]
            gold += top[3]

        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold))

    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 9
0
    def _load_valid_data(self, valid_data_paths):
        """Load validation data from separate files and store them in `self.valid_das`
        and `self.valid_trees`.

        With multiple references, the trees are regrouped so that each item of
        `self.valid_trees` is the list of all references for one instance; without them,
        each tree is wrapped in a list of size 1.

        @param valid_data_paths: comma-separated list of file paths -- DAs, trees, \
            and optionally contexts
        @raise ValueError: if contexts are in use but no context file is specified \
            (or if the number of paths is neither 2 nor 3)
        """
        # parse validation data file specification
        paths = valid_data_paths.split(',')
        valid_context_file = None
        if len(paths) == 3:  # with contexts (this does not determine if they're used)
            valid_das_file, valid_trees_file, valid_context_file = paths
        else:
            valid_das_file, valid_trees_file = paths

        # fail early with a clear message (this used to end up as an unbound local
        # variable error only after all the data had been loaded)
        if self.use_context and valid_context_file is None:
            raise ValueError('Contexts are used, but no validation context file was '
                             'specified in: ' + valid_data_paths)

        # load the validation data
        log_info('Reading DAs from ' + valid_das_file + '...')
        self.valid_das = read_das(valid_das_file)
        self.valid_trees = self._load_trees(valid_trees_file,
                                            selector=self.ref_selectors)
        if self.use_context:
            self.valid_das = self._load_contexts(self.valid_das,
                                                 valid_context_file)

        # reorder validation data for multiple references (see also _cut_valid_data)
        valid_size = len(self.valid_trees)
        if self.multiple_refs:
            num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

            # serial: different instances next to each other, then synonymous in the same order
            if refs_stored == 'serial':
                # explicit floor division: sizes and indexes must stay integers
                # even when true division is in effect
                num_insts = valid_size // num_refs
                valid_tree_chunks = list(chunk_list(self.valid_trees, num_insts))
                self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                    for i in range(num_insts)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[0:num_insts]
            # parallel: synonymous instances next to each other
            elif refs_stored == 'parallel':
                self.valid_trees = list(chunk_list(self.valid_trees, num_refs))
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[::num_refs]

        # no multiple references; make lists of size 1 to simplify working with the data
        else:
            self.valid_trees = [[tree] for tree in self.valid_trees]
def get_test_das():
    """Return the DAs to generate from at test time.

    The validation (development) set is used if VALIDATION_NOT_TEST is set, the test set
    otherwise. With DATASET_WEBNLG set, the DAs are the first element of what
    `get_das_texts_from_webnlg` returns for the WebNLG file; otherwise they are read
    from the E2E DA files.
    """
    if DATASET_WEBNLG:
        if VALIDATION_NOT_TEST:
            webnlg_file = "WebNLG_Reader/data/webnlg/valid.json"
        else:
            webnlg_file = "WebNLG_Reader/data/webnlg/test.json"
        return get_das_texts_from_webnlg(webnlg_file)[0]

    if VALIDATION_NOT_TEST:
        return read_das("tgen/e2e-challenge/input/devel-das.txt")
    return read_das("tgen/e2e-challenge/input/test-das.txt")
Esempio n. 11
0
    def _load_valid_data(self, valid_data_paths):
        """Load validation data from separate files into `self.valid_das` and
        `self.valid_trees`.

        If multiple references are used, the trees are regrouped so that every item of
        `self.valid_trees` lists all references of one instance; otherwise every tree
        is wrapped in a list of size 1.

        @param valid_data_paths: comma-separated list of file paths -- DAs, trees, \
            and optionally contexts
        @raise ValueError: if contexts are in use but no context file is specified \
            (or if the number of paths is neither 2 nor 3)
        """
        # parse validation data file specification
        path_list = valid_data_paths.split(',')
        valid_context_file = None
        if len(path_list) == 3:  # with contexts (this does not determine if they're used)
            valid_das_file, valid_trees_file, valid_context_file = path_list
        else:
            valid_das_file, valid_trees_file = path_list

        # report the missing contexts file explicitly, before any loading is done
        # (otherwise the variable would be unbound at the point of its use below)
        if self.use_context and valid_context_file is None:
            raise ValueError('Contexts are used, but no validation context file was '
                             'specified in: ' + valid_data_paths)

        # load the validation data
        log_info('Reading DAs from ' + valid_das_file + '...')
        self.valid_das = read_das(valid_das_file)
        self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
        if self.use_context:
            self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

        # reorder validation data for multiple references (see also _cut_valid_data)
        valid_size = len(self.valid_trees)
        if self.multiple_refs:
            num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

            # serial: different instances next to each other, then synonymous in the same order
            if refs_stored == 'serial':
                # floor division keeps the chunk size / range bound / slice index integral
                # regardless of the division semantics in effect
                insts_per_ref = valid_size // num_refs
                valid_tree_chunks = list(chunk_list(self.valid_trees, insts_per_ref))
                self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                    for i in range(insts_per_ref)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[0:insts_per_ref]
            # parallel: synonymous instances next to each other
            elif refs_stored == 'parallel':
                self.valid_trees = list(chunk_list(self.valid_trees, num_refs))
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[::num_refs]

        # no multiple references; make lists of size 1 to simplify working with the data
        else:
            self.valid_trees = [[tree] for tree in self.valid_trees]
Esempio n. 12
0
    def train(self, das, trees, data_portion=1.0, valid_das=None, valid_trees=None):
        """Fit the DA output vectorizer on (a portion of) the training DAs.

        @param das: training DAs -- a list, or anything else, which is then treated \
            as the path of a file to read the DAs from
        @param trees: training trees; only their count is used here, to determine \
            the size of the training portion
        @param data_portion: fraction of the data to be actually used
        @param valid_das: not used in this method
        @param valid_trees: not used in this method
        """
        # DAs may come as a file path instead of a list
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)

        # make training data smaller if necessary
        portion_size = int(round(data_portion * len(trees)))
        self.train_das = das[:portion_size]

        # vectorize the DA features; every resulting feature is one binary class
        da_feat_dicts = [self.da_feats.get_features(None, {'da': da})
                         for da in self.train_das]
        self.y = self.da_vect.fit_transform(da_feat_dicts)
        num_classes = len(self.da_vect.get_feature_names())
        log_info('Number of binary classes: %d.' % num_classes)
Esempio n. 13
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Loads a trained generator, generates an output for each input DA (optionally paired
    with a context), and, depending on the arguments, evaluates the outputs against
    reference trees or texts, lexicalizes them and writes them to a file.

    @param args: list of command-line arguments (see the argument parser set up below)
    """
    ap = ArgumentParser()

    # optional arguments
    ap.add_argument('-e', '--eval-file', type=str,
                    help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str,
                    help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str,
                    help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    # positional arguments
    ap.add_argument('seq2seq_model_file', type=str,
                    help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str,
                    help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator, overriding its beam size if requested
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input DAs and pair them with contexts, if the generator can use them
    das = read_das(args.da_test_file)
    if args.context_file:
        if tgen.use_context or tgen.context_bleu_weight:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = list(zip(contexts, das))
        else:
            log_warn('Generator is not trained to use context, ignoring context input file.')

    # generate
    log_info('Generating...')
    gen_trees = []
    for idx, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % idx)
        gen_trees.append(tgen.generate_tree(da))
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized);
    # this only applies if the evaluation file is not a plain-text one
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        Evaluator().process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                     args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        ref_tokens = read_tokens(args.eval_file, ref_mode=True)
        eval_tokens(das, ref_tokens, gen_trees)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if not args.output_file.endswith('.txt'):
            out_doc = create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                       args.target_selector or tgen.selector)
            write_ttrees(out_doc, args.output_file)
        else:
            write_tokens(gen_trees, args.output_file)
Esempio n. 14
0
# Exactly two positional arguments are required: the trees file and the DAs file.
# NOTE(review): `files` and `opts` are not defined in this part of the script --
# presumably they come from an earlier getopt() call; confirm.
if len(files) != 2:
    sys.exit(
        'Usage: python inspect_data.py [-l lang] [-s selector] <trees.yaml> <das.txt>'
    )

# defaults for the language and selector used to pick the trees from the document
language = 'en'
selector = ''

# override the defaults by command-line options
for opt, arg in opts:
    if opt == '-l':
        language = arg
    elif opt == '-s':
        selector = arg

# load the data: trees from the first file, DAs from the second one
trees = trees_from_doc(read_ttrees(files[0]), language, selector)
das = read_das(files[1])

# TREE SIZES

# histogram: tree size (as given by len()) -> number of trees of that size
tree_sizes = defaultdict(int)
for tree in trees:
    tree_sizes[len(tree)] += 1

# print the histogram ordered by tree size, one "size <TAB> count" line per size
print "Tree sizes:\n==========="
for k, v in sorted(tree_sizes.items()):
    print k, "\t", v

# DAS -> NODES

# containers for the statistics that follow
das_for_nodes = {}
num_occ_nodes = defaultdict(int)
Esempio n. 15
0
# parse command-line options (-l and -s, both taking a value) and positional arguments
opts, files = getopt(sys.argv[1:], 'l:s:')

# exactly two positional arguments are required: the trees file and the DAs file
if len(files) != 2:
    sys.exit('Usage: python inspect_data.py [-l lang] [-s selector] <trees.yaml> <das.txt>')

# defaults for the language and selector used to pick the trees from the document
language = 'en'
selector = ''

# override the defaults by command-line options
for opt, arg in opts:
    if opt == '-l':
        language = arg
    elif opt == '-s':
        selector = arg

# load the data: trees from the first file, DAs from the second one
trees = trees_from_doc(read_ttrees(files[0]), language, selector)
das = read_das(files[1])


# TREE SIZES

# histogram: tree size (as given by len()) -> number of trees of that size
tree_sizes = defaultdict(int)
for tree in trees:
    tree_sizes[len(tree)] += 1

# print the histogram ordered by tree size, one "size <TAB> count" line per size
print "Tree sizes:\n==========="
for k, v in sorted(tree_sizes.items()):
    print k, "\t", v

# DAS -> NODES

# container for the statistics that follow
das_for_nodes = {}
Esempio n. 16
0
    def _init_training(self, das_file, ttree_file, data_portion, context_file,
                       validation_files):
        """Load training (and validation) data, prepare training batches, optionally train
        the n-best list classification filter, and build and initialize the NN.

        @param das_file: training DAs (file path)
        @param ttree_file: training t-trees (file path)
        @param data_portion: portion of the data to be actually used for training
        @param context_file: training contexts (file path)
        @param validation_files: validation file paths (or None)
        """
        # read training data (attaching contexts to DAs if they are in use)
        log_info('Reading DAs from ' + das_file + '...')
        all_das = read_das(das_file)
        all_trees = self._load_trees(ttree_file)
        if self.use_context:
            all_das = self._load_contexts(all_das, context_file)

        # make training data smaller if necessary
        portion_size = int(round(data_portion * len(all_trees)))
        self.train_trees = all_trees[:portion_size]
        self.train_das = all_das[:portion_size]

        # validation data: either from separate files...
        if validation_files:
            self._load_valid_data(validation_files)
        # ... or cut off from the training data
        # (this will set train_trees, valid_trees, train_das, valid_das)
        elif self.validation_size > 0:
            self._cut_valid_data()
        data_sizes = (len(self.train_das), len(self.valid_das))
        log_info('Using %d training, %d validation instances.' % data_sizes)

        # initialize embeddings, picking the extractor classes by configuration
        da_embs_class = (ContextDAEmbeddingSeq2SeqExtract if self.use_context
                         else DAEmbeddingSeq2SeqExtract)
        self.da_embs = da_embs_class(cfg=self.cfg)
        tree_embs_class = (TokenEmbeddingSeq2SeqExtract if self.use_tokens
                           else TreeEmbeddingSeq2SeqExtract)
        self.tree_embs = tree_embs_class(cfg=self.cfg)

        self.da_dict_size = self.da_embs.init_dict(self.train_das)
        self.tree_dict_size = self.tree_embs.init_dict(self.train_trees)
        self.max_tree_len = self.tree_embs.get_embeddings_shape()[0]
        self.max_da_len = self.da_embs.get_embeddings_shape()[0]

        # prepare training batches: encoder inputs from DAs first...
        da_emb_list = [self.da_embs.get_embeddings(da) for da in self.train_das]
        self.train_enc = [cut_batch_into_steps(batch)
                          for batch in grouper(da_emb_list, self.batch_size, None)]
        # ... then decoder inputs from trees
        tree_emb_list = [self.tree_embs.get_embeddings(tree) for tree in self.train_trees]
        self.train_dec = [cut_batch_into_steps(batch)
                          for batch in grouper(tree_emb_list, self.batch_size, None)]

        # train the classifier for filtering n-best lists
        if self.classif_filter:
            self.classif_filter.train(self.train_das, self.train_trees,
                                      valid_das=self.valid_das,
                                      valid_trees=self.valid_trees)
            # restore the best performance on devel data
            self.classif_filter.restore_checkpoint()

        # convert validation data to flat trees to enable F1 measuring
        if self.validation_size > 0 and self.use_tokens:
            self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees)

        # initialize top costs
        self.top_k_costs = [float('nan')] * self.top_k
        self.checkpoint_path = None

        # build the NN and initialize its variables
        self._init_neural_network()
        self.session.run(tf.initialize_all_variables())
Esempio n. 17
0
#!/usr/bin/env python


from flect.config import Config
from tgen.features import Features
from tgen.futil import trees_from_doc, read_ttrees, read_das
import sys
import timeit
import datetime

# exactly three command-line arguments (besides the script name) are required
if len(sys.argv) != 4:
    sys.exit('Usage: ./bench_feats.py features_cfg.py trees.yaml.gz das.txt')

sys.stderr.write('Loading...' + '\n')

# load the features configuration and the data to run the benchmark on
cfg = Config(sys.argv[1])
trees = trees_from_doc(read_ttrees(sys.argv[2]), 'en', '')
das = read_das(sys.argv[3])

# the feature extractor under test
feats = Features(cfg['features'])

def test_func():
    """Benchmarked code: extract features once for every (tree, DA) pair of the loaded data."""
    for cur_tree, cur_da in zip(trees, das):
        feats.get_features(cur_tree, {'da': cur_da})
    

sys.stderr.write('Running test...' + '\n')
# total time of 10 runs of test_func (imported from the running script itself)
secs = timeit.timeit('test_func()', setup='from __main__ import test_func', number=10)
td = datetime.timedelta(seconds=secs)
sys.stderr.write('Time taken: %s' % str(td) + '\n')
Esempio n. 18
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Loads a trained generator and generates an output for each input DA (optionally paired
    with a context) into a token list or a t-tree document; if an evaluation file is given,
    the outputs are evaluated against it. The outputs may also be written to a file.

    @param args: list of command-line arguments (see the argument parser set up below)
    """
    ap = ArgumentParser()

    # optional arguments
    ap.add_argument('-e', '--eval-file', type=str,
                    help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instsructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str,
                    help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str,
                    help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    # positional arguments
    ap.add_argument('seq2seq_model_file', type=str,
                    help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str,
                    help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator, overriding its beam size if requested
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input DAs and pair them with contexts, if the generator can use them
    das = read_das(args.da_test_file)
    if args.context_file:
        if tgen.use_context or tgen.context_bleu_weight:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = list(zip(contexts, das))
        else:
            log_warn('Generator is not trained to use context, ignoring context input file.')

    # prepare evaluation: choose the container that the outputs are generated into
    if args.eval_file is None or args.eval_file.endswith('.txt'):  # just tokens
        gen_doc = []
    else:  # Trees: depending on PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        # identical selectors: use a fresh document instead of the evaluation one
        gen_doc = Document() if args.ref_selector == args.target_selector else eval_doc

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for idx, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % idx)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file is not None:
        log_info(tgen.get_slot_err_stats())
        if not args.eval_file.endswith('.txt'):
            # evaluate the generated trees against golden trees
            ref_trees = ttrees_from_doc(eval_doc, tgen.language, args.ref_selector)
            out_trees = ttrees_from_doc(gen_doc, tgen.language, args.target_selector)
            eval_trees(das, ref_trees, out_trees, eval_doc, tgen.language, tgen.selector)
        else:
            # evaluate the generated tokens (F1 and BLEU scores)
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            ref_tokens = read_tokens(args.eval_file, ref_mode=True)
            eval_tokens(das, ref_tokens, gen_doc)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        writer = write_tokens if args.output_file.endswith('.txt') else write_ttrees
        writer(gen_doc, args.output_file)
Esempio n. 19
0
def asearch_gen(args):
    """A*search generation.

    Generate a tree for each input DA using the A*search planner. If an evaluation
    file is given, also analyze the planner's open/close lists against the gold trees,
    compare the generated trees to the gold ones, and log the resulting statistics.

    Options (parsed by getopt): -e evaluation t-tree file, -s selector of the gold trees
    in the evaluation file, -d debug stream output file, -w output t-tree file,
    -c configuration file.

    @param args: command-line arguments: the options above followed by exactly three \
        file names (candidate generator model, ranker model, test DAs)
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # append the module docstring if there is one (it is None otherwise,
        # which must not turn the error message into a TypeError)
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    # generate into a fresh document, unless the generated trees can live
    # alongside the gold ones in the evaluation document (different selectors)
    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(
                              da, gold_tree, gen_tree) + "\n")

        # all three values are printed with 4 decimal places
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(
                eval_bundle, tgen.language, tgen.selector + 'Xscore',
                "P: %.4f R: %.4f F1: %.4f" %
                p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.score_stats())
        log_info(
            "Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
            evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 20
0
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language,
                                         self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language,
                                             self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(
                trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        # (slicing copies the lists, so the caller's data are not modified below)
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        # (the emptiness check avoids an IndexError if no instance was selected)
        if self.train_das and isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [
                da.get_delexicalized(self.delex_slots) for da in self.train_das
            ]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        # a real list (not a Python 3 range object), so that the order can be changed in place
        self.train_order = list(range(len(self.train_trees)))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)
        # one binary output class per feature found by the DA vectorizer
        num_classes = len(self.da_vect.get_feature_names())
        log_info('Number of binary classes: %d.' % num_classes)

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = num_classes

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
Esempio n. 21
0
    def _init_training(self, das_file, ttree_file, data_portion, context_file, validation_files):
        """Load training data, prepare batches, build the NN.

        @param das_file: training DAs (file path)
        @param ttree_file: training t-trees (file path)
        @param data_portion: portion of the data to be actually used for training
        @param context_file: training contexts (file path)
        @param validation_files: validation file paths (or None)
        """
        # read training data
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        trees = self._load_trees(ttree_file)
        if self.use_context:
            das = self._load_contexts(das, context_file)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # load separate validation data files...
        if validation_files:
            self._load_valid_data(validation_files)
        # ... or save part of the training data for validation:
        elif self.validation_size > 0:
            self._cut_valid_data()  # will set train_trees, valid_trees, train_das, valid_das
        # NOTE(review): valid_das/valid_trees are read below even if neither branch above
        # was taken -- presumably they are pre-set elsewhere in the class; confirm
        log_info('Using %d training, %d validation instances.' %
                 (len(self.train_das), len(self.valid_das)))

        # initialize embeddings
        if self.use_context:
            self.da_embs = ContextDAEmbeddingSeq2SeqExtract(cfg=self.cfg)
        else:
            self.da_embs = DAEmbeddingSeq2SeqExtract(cfg=self.cfg)
        if self.use_tokens:
            self.tree_embs = TokenEmbeddingSeq2SeqExtract(cfg=self.cfg)
        else:
            self.tree_embs = TreeEmbeddingSeq2SeqExtract(cfg=self.cfg)

        self.da_dict_size = self.da_embs.init_dict(self.train_das)
        self.tree_dict_size = self.tree_embs.init_dict(self.train_trees)
        self.max_tree_len = self.tree_embs.get_embeddings_shape()[0]
        self.max_da_len = self.da_embs.get_embeddings_shape()[0]

        # prepare training batches (DA embeddings on the encoder side,
        # tree embeddings on the decoder side)
        self.train_enc = [cut_batch_into_steps(b)
                          for b in grouper([self.da_embs.get_embeddings(da)
                                            for da in self.train_das],
                                           self.batch_size, None)]
        self.train_dec = [cut_batch_into_steps(b)
                          for b in grouper([self.tree_embs.get_embeddings(tree)
                                            for tree in self.train_trees],
                                           self.batch_size, None)]

        # train the classifier for filtering n-best lists
        if self.classif_filter:
            self.classif_filter.train(self.train_das, self.train_trees,
                                      valid_das=self.valid_das,
                                      valid_trees=self.valid_trees)
            self.classif_filter.restore_checkpoint()  # restore the best performance on devel data

        # convert validation data to flat trees to enable F1 measuring
        if self.validation_size > 0 and self.use_tokens:
            self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees)

        # initialize top costs
        self.top_k_costs = [float('nan')] * self.top_k
        self.checkpoint_path = None

        # build the NN
        self._init_neural_network()

        # initialize the NN variables; tf.initialize_all_variables() is deprecated in favor
        # of tf.global_variables_initializer(), so use the old name only if the new one
        # is not available in the installed TensorFlow
        init_vars = getattr(tf, 'global_variables_initializer', None)
        if init_vars is None:
            init_vars = tf.initialize_all_variables
        self.session.run(init_vars())
Esempio n. 22
0
from flect.config import Config
from tgen.features import Features
from tgen.futil import trees_from_doc, read_ttrees, read_das
import sys
import timeit
import datetime

# Benchmark of feature extraction: load a feature configuration, trees and DAs,
# then time repeated feature extraction over all (tree, DA) pairs.
if len(sys.argv[1:]) != 3:
    sys.exit('Usage: ./bench_feats.py features_cfg.py trees.yaml.gz das.txt')

# progress messages go to stderr; sys.stderr.write() is used instead of the
# `print >> sys.stderr` statement, which only works under Python 2
sys.stderr.write('Loading...\n')

cfg = Config(sys.argv[1])
trees = trees_from_doc(read_ttrees(sys.argv[2]), 'en', '')
das = read_das(sys.argv[3])

feats = Features(cfg['features'])


def test_func():
    """Extract features for every (tree, DA) pair once (the timed workload)."""
    for tree, da in zip(trees, das):
        feats.get_features(tree, {'da': da})


sys.stderr.write('Running test...\n')
# total time of 10 passes over the data
secs = timeit.timeit('test_func()',
                     setup='from __main__ import test_func',
                     number=10)
td = datetime.timedelta(seconds=secs)
sys.stderr.write('Time taken: %s\n' % str(td))
Esempio n. 23
0
def asearch_gen(args):
    """A*search generation.

    Generate a tree for each input DA using the A*search planner. If an evaluation
    file is given, also analyze the planner's open/close lists against the gold trees,
    compare the generated trees to the gold ones, and log the resulting statistics.

    Options (parsed by getopt): -e evaluation t-tree file, -s selector of the gold trees
    in the evaluation file, -d debug stream output file, -w output t-tree file,
    -c configuration file.

    @param args: command-line arguments: the options above followed by exactly three \
        file names (candidate generator model, ranker model, test DAs)
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # append the module docstring if there is one (it is None otherwise,
        # which must not turn the error message into a TypeError)
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    # generate into a fresh document, unless the generated trees can live
    # alongside the gold ones in the evaluation document (different selectors)
    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        # all three values are printed with 4 decimal places
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 24
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Load a trained Seq2Seq generator, generate one tree per input DA (paired with
    a context where the generator works with contexts), then evaluate, lexicalize
    and write the results as requested by the command-line options.

    @param args: list of command-line arguments (see the help texts below)
    """
    parser = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    # (names, settings) of all arguments, in the order in which they are registered
    arg_specs = [
        (('-e', '--eval-file'),
         {'type': str, 'help': 'A ttree/text file for evaluation'}),
        (('-a', '--abstr-file'),
         {'type': str,
          'help': 'Lexicalization file (a.k.a. abstraction instructions, for postprocessing)'}),
        (('-r', '--ref-selector'),
         {'type': str, 'default': '',
          'help': 'Selector for reference trees in the evaluation file'}),
        (('-t', '--target-selector'),
         {'type': str, 'default': '',
          'help': 'Target selector for generated trees in the output file'}),
        (('-d', '--debug-logfile'),
         {'type': str, 'help': 'Debug output file name'}),
        (('-w', '--output-file'),
         {'type': str, 'help': 'Output tree/text file'}),
        (('-b', '--beam-size'),
         {'type': int, 'help': 'Override beam size for beam search decoding'}),
        (('-c', '--context-file'),
         {'type': str, 'help': 'Input ttree/text file with context utterances'}),
        (('seq2seq_model_file',),
         {'type': str, 'help': 'Trained Seq2Seq generator model'}),
        (('da_test_file',),
         {'type': str, 'help': 'Input DAs for generation'}),
    ]
    for names, settings in arg_specs:
        parser.add_argument(*names, **settings)

    args = parser.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator, overriding its beam size if requested
    generator = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        generator.beam_size = args.beam_size

    # read the input DAs and pair them with contexts where applicable
    das = read_das(args.da_test_file)
    if args.context_file:
        if generator.use_context or generator.context_bleu_weight:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                context_doc = read_ttrees(args.context_file)
                contexts = tokens_from_doc(context_doc,
                                           generator.language, generator.selector)
            das = list(zip(contexts, das))
        else:
            log_warn('Generator is not trained to use context, ignoring context input file.')
    elif generator.use_context or generator.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate one tree per input, reporting progress every 100 trees
    log_info('Generating...')
    gen_trees = []
    for idx, da in enumerate(das):
        tree_no = idx + 1
        log_debug("\n\nTREE No. %03d" % tree_no)
        gen_trees.append(generator.generate_tree(da))
        if tree_no % 100 == 0:
            log_info("Generated tree %d" % tree_no)
    log_info(generator.get_slot_err_stats())

    # a '.txt' evaluation file is evaluated on tokens, anything else on trees
    eval_on_text = bool(args.eval_file) and args.eval_file.endswith('.txt')

    # tree evaluation comes first, before the generated trees are lexicalized
    eval_doc = None
    if args.eval_file and not eval_on_text:
        eval_doc = read_ttrees(args.eval_file)
        tree_evaler = Evaluator()
        tree_evaler.process_eval_doc(eval_doc, gen_trees, generator.language,
                                     args.ref_selector,
                                     args.target_selector or generator.selector)

    # lexicalize, if required
    if args.abstr_file and generator.lexicalizer:
        log_info('Lexicalizing...')
        generator.lexicalize(gen_trees, args.abstr_file)

    # from now on, only the DAs themselves are needed (no contexts)
    if generator.use_context or generator.context_bleu_weight:
        das = [da for (_context, da) in das]

    # token evaluation (F1 and BLEU scores) works on the lexicalized outputs
    if eval_on_text:
        references = read_tokens(args.eval_file, ref_mode=True)
        eval_tokens(das, references, [tree.to_tok_list() for tree in gen_trees])

    # nothing more to do unless an output file (.yaml.gz or .txt) was requested
    if args.output_file is None:
        return

    log_info('Writing output...')
    if args.output_file.endswith('.txt'):
        out_tokens = [tree.to_tok_list() for tree in gen_trees]
        postprocess_tokens(out_tokens, das)
        write_tokens(out_tokens, args.output_file)
    else:
        out_doc = create_ttree_doc(gen_trees, eval_doc, generator.language,
                                   args.target_selector or generator.selector)
        write_ttrees(out_doc, args.output_file)
Esempio n. 25
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Load a trained Seq2Seq generator, generate one tree per input DA (paired with
    a context where the generator works with contexts), optionally write the output
    before lexicalization, then evaluate, lexicalize and write the final output as
    requested by the command-line options.

    @param args: list of command-line arguments (see the help texts below)
    """
    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language,
                              selector):
        """Decide to write t-trees or tokens based on the output file name.

        @param output_file: output file name; '.txt' means tokens, anything else t-trees
        @param das: the DAs (without contexts) corresponding to the generated trees
        @param gen_trees: the generated trees
        @param base_doc: document passed on to create_ttree_doc for t-tree output (or None)
        @param language: language of the written t-trees
        @param selector: selector of the written t-trees
        """
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(
                create_ttree_doc(gen_trees, base_doc, language, selector),
                output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str,
                    help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str,
                    help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str,
                    help='Output tree/text file')
    ap.add_argument('-D', '--delex-output-file', type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str,
                    help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn(
                'Generator is not trained to use context, ignoring context input file.'
            )
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # we won't need contexts anymore, but we do need DAs; the contexts are dropped
    # before ANY output is written, so that both the delexicalized and the final
    # output are postprocessed with plain DAs and not with (context, DA) pairs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        write_trees_or_tokens(args.delex_output_file, das, gen_trees, None,
                              tgen.language, args.target_selector
                              or tgen.selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector
                                or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc,
                              tgen.language, args.target_selector
                              or tgen.selector)
Esempio n. 26
0
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        # (slicing copies the lists, so the caller's data are not modified below)
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        # (the emptiness check avoids an IndexError if no instance was selected)
        if self.train_das and isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        # a real list (not a Python 3 range object), so that the order can be changed in place
        self.train_order = list(range(len(self.train_trees)))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)
        # one binary output class per feature found by the DA vectorizer
        num_classes = len(self.da_vect.get_feature_names())
        log_info('Number of binary classes: %d.' % num_classes)

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = num_classes

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())