Esempio n. 1
0
    def _beam_search(self, enc_inputs, da):
        """Run beam search decoding.

        The beam is expanded for at most as many steps as there are decoder steps for
        an empty tree. After each step, only the `self.beam_size` best paths (by
        length-normalized log-probability) are kept. The resulting n-best list is
        optionally reranked and one path is selected as the output.

        @param enc_inputs: encoder inputs; true batches are not supported, so
            `len(enc_inputs[0])` must be 1
        @param da: the input DA (used for reranking and slot error statistics)
        @return: the decoder inputs (token IDs) of the selected path, as a numpy array
        """

        # true "batches" not implemented
        assert len(enc_inputs[0]) == 1

        # run greedy decoder for comparison (debugging purposes)
        log_debug("GREEDY DEC WOULD RETURN:\n" +
                  " ".join(self.tree_embs.ids_to_strings(
                      [out_tok[0] for out_tok in self._greedy_decoding(enc_inputs, None)[0]])))

        # initialize
        self._init_beam_search(enc_inputs)
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_inputs = cut_batch_into_steps([empty_tree_emb])

        paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])]

        def norm_logprob(path):
            """Length-weighted log-probability of a path (the beam ranking criterion)."""
            return path.logprob / (len(path) ** self.length_norm_weight)

        # beam search steps
        for step in range(len(dec_inputs)):

            # expand all paths currently on the beam
            new_paths = []
            for path in paths:
                out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
                new_paths.extend(path.expand(self.beam_size, out_probs, st))

            # keep just the best paths; a key function (instead of the Python-2-only
            # `cmp` argument) yields the same ordering and is evaluated once per path
            paths = sorted(new_paths, key=norm_logprob, reverse=True)[:self.beam_size]

            if all(p.dec_inputs[-1] == self.tree_embs.VOID for p in paths):
                break  # stop decoding if we have reached the end in all paths

            log_debug(("\nBEAM SEARCH STEP %d\n" % step) +
                      "\n".join([("%f\t" % p.logprob) +
                                 " ".join(self.tree_embs.ids_to_strings([inp[0] for inp in p.dec_inputs]))
                                 for p in paths]) + "\n")

        # rerank paths by their distance to the input DA
        if self.classif_filter or self.context_bleu_weight:
            paths = self._rerank_paths(paths, da)

        # measure slot error on the top k paths
        if self.slot_err_stats:
            for path in paths[:self.sample_top_k]:
                self.slot_err_stats.append(
                        da, self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs]))

        # select the "best" path -- either the best, or one in top k
        if self.sample_top_k > 1:
            best_path = self._sample_path(paths[:self.sample_top_k])
        else:
            best_path = paths[0]

        # return just the best path (as token IDs)
        return np.array(best_path.dec_inputs)
Esempio n. 2
0
    def _beam_search(self, enc_inputs, da):
        """Run beam search decoding.

        The beam is expanded for at most as many steps as there are decoder steps for
        an empty tree. After each step, only the `self.beam_size` best paths (by
        length-normalized log-probability) are kept. The resulting n-best list is
        optionally reranked and one path is selected as the output.

        @param enc_inputs: encoder inputs; true batches are not supported, so
            `len(enc_inputs[0])` must be 1
        @param da: the input DA (used for reranking and slot error statistics)
        @return: the decoder inputs (token IDs) of the selected path, as a numpy array
        """

        # true "batches" not implemented
        assert len(enc_inputs[0]) == 1

        # run greedy decoder for comparison (debugging purposes)
        log_debug("GREEDY DEC WOULD RETURN:\n" +
                  " ".join(self.tree_embs.ids_to_strings(
                      [out_tok[0] for out_tok in self._greedy_decoding(enc_inputs, None)[0]])))

        # initialize
        self._init_beam_search(enc_inputs)
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_inputs = cut_batch_into_steps([empty_tree_emb])

        paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])]

        def norm_logprob(path):
            """Length-weighted log-probability of a path (the beam ranking criterion)."""
            return path.logprob / (len(path) ** self.length_norm_weight)

        # beam search steps
        for step in range(len(dec_inputs)):

            # expand all paths currently on the beam
            new_paths = []
            for path in paths:
                out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
                new_paths.extend(path.expand(self.beam_size, out_probs, st))

            # keep just the best paths; a key function (instead of the Python-2-only
            # `cmp` argument) yields the same ordering and is evaluated once per path
            paths = sorted(new_paths, key=norm_logprob, reverse=True)[:self.beam_size]

            if all(p.dec_inputs[-1] == self.tree_embs.VOID for p in paths):
                break  # stop decoding if we have reached the end in all paths

            log_debug(("\nBEAM SEARCH STEP %d\n" % step) +
                      "\n".join([("%f\t" % p.logprob) +
                                 " ".join(self.tree_embs.ids_to_strings([inp[0] for inp in p.dec_inputs]))
                                 for p in paths]) + "\n")

        # rerank paths by their distance to the input DA
        if self.classif_filter or self.context_bleu_weight:
            paths = self._rerank_paths(paths, da)

        # measure slot error on the top k paths
        if self.slot_err_stats:
            for path in paths[:self.sample_top_k]:
                self.slot_err_stats.append(
                        da, self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs]))

        # select the "best" path -- either the best, or one in top k
        if self.sample_top_k > 1:
            best_path = self._sample_path(paths[:self.sample_top_k])
        else:
            best_path = paths[0]

        # return just the best path (as token IDs)
        return np.array(best_path.dec_inputs)
Esempio n. 3
0
    def get_all_successors(self, cand_tree):
        """Generate every tree that results from adding one node to the candidate tree.

        New nodes are only created where the current CDFS (`self.cur_cdfs`), the
        per-parent child limits (`self.max_children`) and the node number limits
        (`self.cur_limits`, if set) allow it. If a tree classifier (`self.classif`)
        is set, successors that it does not mark as a subset of the current DA are
        discarded.

        NB: This assumes projectivity (will never create a non-projective tree).

        @param cand_tree: The current candidate tree to be expanded
        @return: a list of successor trees (may be empty)
        """
        # TODO possibly avoid creating TreeNode instances for iterating
        nodes = TreeNode(cand_tree).get_descendants(add_self=1, ordered=1)
        level_counts = defaultdict(int)
        successors = []
        if self.cur_limits is not None:
            # the tree is already as large as allowed
            if len(nodes) >= self.cur_limits['total']:
                return []
            # count the nodes on each depth level
            for node in nodes:
                level_counts[node.get_depth()] += 1

        # attempt to attach one new child under each node in turn
        for parent_num, parent in enumerate(nodes):
            parent_id = self._parent_node_id(parent)
            # the parent already has as many children as allowed
            if len(parent.get_children()) >= self.max_children.get(parent_id, 0):
                continue
            # no known children for this kind of parent
            if parent_id not in self.cur_cdfs:
                continue
            # the level of the would-be child is already full
            if self.cur_limits is not None:
                child_depth = parent.get_depth() + 1
                if level_counts[child_depth] >= self.cur_limits[child_depth]:
                    continue

            # go through all formeme/t-lemma/direction variants of the new child
            for cdf_item in self.cur_cdfs[parent_id]:
                formeme, t_lemma, right = cdf_item[0]

                # variant 1: the child immediately follows/precedes the parent
                adjacent = cand_tree.clone()
                adjacent.create_child(parent_num, right, NodeData(t_lemma, formeme))
                successors.append(adjacent)

                # further variants: the child is placed after/before the subtree of each
                # existing right/left child of the parent (right/left new child, respectively)
                sibling_idxs = cand_tree.children_idxs(parent_num,
                                                       left_only=not right, right_only=right)
                for sibling_idx in sibling_idxs:
                    shifted = cand_tree.clone()
                    bound = shifted.subtree_bound(sibling_idx, right)
                    shifted.create_child(parent_num, bound + (1 if right else 0),
                                         NodeData(t_lemma, formeme))
                    successors.append(shifted)

        # nothing to filter, or no classifier to filter with
        if not (self.classif and successors):
            return successors

        # keep only the successors that do not talk about anything absent from the current DA
        orig_len = len(successors)
        is_subset = self.classif.is_subset_of_cur_da(successors)
        kept = [tree for tree, is_sub in zip(successors, is_subset) if is_sub]
        final_len = len(kept)
        if orig_len > final_len:
            log_debug('Tree classification reduced successors %d -> %d' % (orig_len, final_len))
        return kept
Esempio n. 4
0
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training: run the parent class's initialization, then set up the
        neural network and the bookkeeping of weights.

        All parameters are passed unchanged to the parent class's `_init_training`.

        @param das_file: passed to the parent class (presumably the training DAs file)
        @param ttree_file: passed to the parent class (presumably the training t-trees file)
        @param data_portion: passed to the parent class (presumably the portion of data to use)
        """
        # load data, determine number of features etc. etc.
        super(SimpleNNRanker, self)._init_training(das_file, ttree_file, data_portion)

        # the network is created only after the parent's initialization has run
        self._init_neural_network()

        # start with an empty list of weights after iterations, then update the weights sum
        self.w_after_iter = []
        self.update_weights_sum()

        # log the initial state (see `_feat_val_str`)
        log_debug('\n***\nINIT:')
        log_debug(self._feat_val_str())
        log_info('Training ...')
Esempio n. 5
0
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training: run the parent class's initialization, then set up the
        neural network and the bookkeeping of weights.

        All parameters are passed unchanged to the parent class's `_init_training`.

        @param das_file: passed to the parent class (presumably the training DAs file)
        @param ttree_file: passed to the parent class (presumably the training t-trees file)
        @param data_portion: passed to the parent class (presumably the portion of data to use)
        """
        # load data, determine number of features etc. etc.
        super(SimpleNNRanker, self)._init_training(das_file, ttree_file, data_portion)

        # the network is created only after the parent's initialization has run
        self._init_neural_network()

        # start with an empty list of weights after iterations, then update the weights sum
        self.w_after_iter = []
        self.update_weights_sum()

        # log the initial state (see `_feat_val_str`)
        log_debug('\n***\nINIT:')
        log_debug(self._feat_val_str())
        log_info('Training ...')
Esempio n. 6
0
    def lexicalize(self, gen_trees, abst_file):
        """Fill in concrete values for delexicalized ("X-<slot>") nodes of the generated trees
        (which may represent trees, tokens, or tagged lemmas). The trees are changed in place.

        Expects lexicalization file (and surface forms file) to be loaded in the Lexicalizer
        object, otherwise nothing will happen. The actual operation depends on the generator
        mode (`self.mode`).

        @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
        @param abst_file: abstraction/delexicalization instructions file path
        @return: None
        """
        abstss = smart_load_absts(abst_file, len(gen_trees))
        for sent_no, (tree, absts) in enumerate(zip(gen_trees, abstss)):
            log_debug("Lexicalizing sentence %d: %s" % ((sent_no + 1), str(tree)))
            sent = self._tree_to_sentence(tree)
            log_debug(str(sent))

            for pos, token in enumerate(sent):
                # only delexicalized placeholders are of interest
                if not token or not token.startswith('X-'):
                    continue
                slot = token[2:]
                # skip the slot if there is no value to substitute
                abst = self._first_abst(absts, slot)
                if not abst:
                    continue

                if self.mode == 'tagged_lemmas':
                    # tagged lemmas: one token with appropriate value
                    tag = sent[pos + 1] if pos < len(sent) - 1 else None
                    val = self.get_surface_form(sent, pos, slot, abst.value, tag=tag)
                    tree.nodes[pos + 1] = NodeData(t_lemma=val, formeme='x')
                elif self.mode == 'trees':
                    # trees: one node with appropriate value, keep formeme
                    formeme = sent[pos + 1] if pos < len(sent) - 1 else None
                    val = self.get_surface_form(sent, pos, slot, abst.value, formeme=formeme)
                    node_idx = old_div(pos, 2) + 1
                    tree.nodes[node_idx] = NodeData(t_lemma=val, formeme=tree[node_idx].formeme)
                else:
                    # tokens: one token with all words from the value (postprocessed below)
                    val = self.get_surface_form(sent, pos, slot, abst.value)
                    tree.nodes[pos + 1] = NodeData(t_lemma=val, formeme='x')

                # save value to be used in LM next time
                sent[pos] = val

            if self.mode != 'tokens':
                continue

            # postprocess tokens (split multi-word nodes into one node per word)
            node_idx = 1
            while node_idx < len(tree):
                lemma = tree[node_idx].t_lemma
                if ' ' in lemma:
                    tree.remove_node(node_idx)
                    for shift, word in enumerate(lemma.split(' ')):
                        tree.create_child(0, node_idx + shift,
                                          NodeData(t_lemma=word, formeme='x'))
                    # skip over the newly created nodes
                    node_idx += shift
                node_idx += 1
Esempio n. 7
0
    def _rerank_paths(self, paths, da):
        """Rerank the n-best decoded paths according to the reranking classifier and/or
        BLEU against context.

        NB: the `logprob` values of the given paths are adjusted in place (BLEU bonus,
        misfit penalty, and length normalization, each only if set to be used).

        @param paths: the decoded paths (n-best list) to be reranked
        @param da: the input DA; the whole DA is passed to the reranking classifier,
            `da[0]` is passed to the BLEU measure and logged as the context
        @return: the paths sorted from the highest to the lowest adjusted log-probability
        """

        trees = [self.tree_embs.ids_to_tree(np.array(path.dec_inputs).transpose()[0])
                 for path in paths]

        # rerank using BLEU against context if set to do so
        if self.context_bleu_weight:
            bm = BLEUMeasure(max_ngram=2)
            bleus = []
            for path, tree in zip(paths, trees):
                bm.reset()
                bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
                if self.context_bleu_metric == 'ngram_prec':
                    bleu = bm.ngram_precision()
                else:
                    bleu = bm.bleu()
                bleus.append(bleu)
                path.logprob += self.context_bleu_weight * bleu

            log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                      "\n".join([("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                                 for b, t in zip(bleus, trees)]))

        # add distances to logprob so that non-fitting will be heavily penalized
        if self.classif_filter:
            self.classif_filter.init_run(da)
            fits = self.classif_filter.dist_to_cur_da(trees)
            for path, fit in zip(paths, fits):
                path.logprob -= self.misfit_penalty * fit

            log_debug(("Misfits for DA: %s\n\n" % str(da)) +
                      "\n".join([("%.5f\t" % fit) +
                                 " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                                 for fit, tree in zip(fits, trees)]))

        # adjust paths for length (if set to do so)
        if self.length_norm_weight:
            for path in paths:
                path.logprob /= len(path) ** self.length_norm_weight

        # best paths first; a key function replaces the Python-2-only `cmp` argument
        # (the sort remains stable, so the resulting order is the same)
        return sorted(paths, key=lambda path: path.logprob, reverse=True)
Esempio n. 8
0
    def generate_tree(self, da, gen_doc=None):
        """Generate a tree for the given DA and optionally store it in a document.

        @param da: the input DA
        @param gen_doc: the document where the tree should be saved (defaults to None)
        @return: the generated tree
        """
        log_debug("GENERATE TREE FOR DA: " + unicode(da))
        generated = self.process_das([da])
        tree = generated[0]
        log_debug("RESULT: %s" % unicode(tree))

        # no document given -- there is nothing to save
        if not gen_doc:
            return tree

        # store the tree (and the DA as the "sentence") in the target zone of the document
        zone = self.get_target_zone(gen_doc)
        zone.ttree = tree.create_ttree()
        zone.sentence = unicode(da)
        return tree
Esempio n. 9
0
    def _rerank_paths(self, paths, da):
        """Rerank the n-best decoded paths according to the reranking classifier and/or
        BLEU against context.

        NB: the `logprob` values of the given paths are adjusted in place (BLEU bonus,
        misfit penalty, and length normalization, each only if set to be used).

        @param paths: the decoded paths (n-best list) to be reranked
        @param da: the input DA; the whole DA is passed to the reranking classifier,
            `da[0]` is passed to the BLEU measure and logged as the context
        @return: the paths sorted from the highest to the lowest adjusted log-probability
        """

        trees = [self.tree_embs.ids_to_tree(np.array(path.dec_inputs).transpose()[0])
                 for path in paths]

        # rerank using BLEU against context if set to do so
        if self.context_bleu_weight:
            bm = BLEUMeasure(max_ngram=2)
            bleus = []
            for path, tree in zip(paths, trees):
                bm.reset()
                bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
                if self.context_bleu_metric == 'ngram_prec':
                    bleu = bm.ngram_precision()
                else:
                    bleu = bm.bleu()
                bleus.append(bleu)
                path.logprob += self.context_bleu_weight * bleu

            log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                      "\n".join([("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                                 for b, t in zip(bleus, trees)]))

        # add distances to logprob so that non-fitting will be heavily penalized
        if self.classif_filter:
            self.classif_filter.init_run(da)
            fits = self.classif_filter.dist_to_cur_da(trees)
            for path, fit in zip(paths, fits):
                path.logprob -= self.misfit_penalty * fit

            log_debug(("Misfits for DA: %s\n\n" % str(da)) +
                      "\n".join([("%.5f\t" % fit) +
                                 " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                                 for fit, tree in zip(fits, trees)]))

        # adjust paths for length (if set to do so)
        if self.length_norm_weight:
            for path in paths:
                path.logprob /= len(path) ** self.length_norm_weight

        # best paths first; a key function replaces the Python-2-only `cmp` argument
        # (the sort remains stable, so the resulting order is the same)
        return sorted(paths, key=lambda path: path.logprob, reverse=True)
Esempio n. 10
0
    def _update_nn(self, bad_feats, good_feats, rate):
        """Run one NN update, wrapping the features into lists as expected by the update
        call, and log the cost and the norms of parameters and gradients.

        @param bad_feats: features of the "bad" instance (only the first two items are used)
        @param good_feats: features of the "good" instance (only the first two items are used)
        @param rate: passed to the update call as its last argument (presumably learning rate)
        """
        # TODO: this is just adding another dimension to fit the parallelized scoring
        # (even if updates are not parallelized). Make it nicer.
        update_args = ([bad_feats[0]], [bad_feats[1]],
                       [good_feats[0]], [good_feats[1]],
                       rate)
        cost_gcost = self.nn.update(*update_args)

        log_debug('Cost:' + str(cost_gcost[0]))
        param_vals = [param.get_value() for param in self.nn.params]
        log_debug('Param norms : ' + str(self._l2s(param_vals)))
        log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))

        # row ranges into which the layer 1 matrices are split for logging
        # (an upper bound of None stands for "up to the end")
        row_ranges = ((0, 100), (100, 200), (200, 350), (350, 500), (500, None))

        l1_params = param_vals[2]
        l1_parts = [l1_params[lo:hi, :] for lo, hi in row_ranges]
        log_debug('Layer 1 parts :' + str(self._l2s(l1_parts)))

        l1_gparams = cost_gcost[3]
        l1_gparts = [l1_gparams[lo:hi, :] for lo, hi in row_ranges]
        log_debug('Layer 1 gparts:' + str(self._l2s(l1_gparts)))
Esempio n. 11
0
 def get_surface_form(self, sentence, pos, possible_forms):
     """Select one of the possible surface forms for the given sentence position,
     based on the scores the language model assigns to them.

     The forms are looked up in the vocabulary lowercased; forms not found there
     are scored as '<UNK>'. The scores of the possible forms are turned into
     a probability distribution using softmax.

     @param sentence: the sentence (list of tokens) to be used as the LM input
     @param pos: position in the sentence for which the form should be selected
     @param possible_forms: list of the surface forms to choose from
     @return: the selected form -- sampled from the distribution if `self._sample` \
         is set, the most probable one otherwise
     """
     log_debug("Pos: %d, forms: %s" %
               (pos, unicode(", ".join(possible_forms))))
     # don't use whole sentence if it's too long: drop tokens from the start so that
     # the target position becomes `max_sent_len - 1`. The shift must be applied
     # to the sentence as well as to the position (updating the position first and
     # then slicing by it would leave the sentence uncut).
     # NOTE(review): assumes `_sent_to_ids` uses at most `max_sent_len` tokens -- confirm
     if pos >= self.max_sent_len:
         shift = pos - self.max_sent_len + 1
         sentence = sentence[shift:]
         pos -= shift
     # get unnormalized scores for the whole vocabulary
     inputs = np.array([self._sent_to_ids(sentence)[:-1]], dtype=np.int32)
     logits = self.session.run([self._logits], {self._inputs: inputs})
     # pick out scores for possible forms
     unk_id = self.vocab.get('<UNK>')
     form_ids = [self.vocab.get(form.lower(), unk_id) for form in possible_forms]
     scores = [logits[0][pos][form_id] for form_id in form_ids]
     probs = softmax(scores)
     # log the vocabulary ID of each form (not just of the last one)
     log_debug("Vocab: %s" % unicode(", ".join(
         [unicode(form_id) for form_id in form_ids])))
     log_debug("Scores: %s, Probs: %s" % (
         unicode(", ".join(["%.3f" % s for s in scores])),
         unicode(", ".join(["%.3f" % p for p in probs]))))
     # sample from the prob. dist.
     if self._sample:
         return np.random.choice(possible_forms, p=probs)
     # get just the most probable option
     max_idx, _ = max(enumerate(probs), key=operator.itemgetter(1))
     return possible_forms[max_idx]
Esempio n. 12
0
    def generate_tree(self, da, gen_doc=None):
        """Generate a tree for the given DA and optionally store it in a "document".

        The document may be a plain list (the generated tokens are appended to it) or
        any other non-empty object, which is handled as a full t-tree document.

        @param da: the input DA
        @param gen_doc: the document where the tree should be saved (defaults to None)
        @return: the generated tree
        """
        log_debug("GENERATE TREE FOR DA: " + unicode(da))
        generated = self.process_das([da])
        tree = generated[0]
        log_debug("RESULT: %s" % unicode(tree))

        # plain lists: store just the generated tokens, disregarding syntax
        if isinstance(gen_doc, list):
            # skip the technical root; None is a placeholder for the POS tag
            tokens = [(node.t_lemma, None) for node in tree.nodes[1:]]
            gen_doc.append(tokens)
            return tree

        # full Pytreex documents: store the whole tree in the target zone
        if gen_doc:
            zone = self.get_target_zone(gen_doc)
            zone.ttree = tree.create_ttree()
            zone.sentence = unicode(da)
        return tree
Esempio n. 13
0
    def generate_tree(self, da, gen_doc=None):
        """Generate a tree for the given DA and optionally store it in a "document".

        The document may be a plain list (the generated tokens are appended to it) or
        any other non-empty object, which is handled as a full t-tree document.

        @param da: the input DA
        @param gen_doc: the document where the tree should be saved (defaults to None)
        @return: the generated tree
        """
        log_debug("GENERATE TREE FOR DA: " + unicode(da))
        generated = self.process_das([da])
        tree = generated[0]
        log_debug("RESULT: %s" % unicode(tree))

        # plain lists: store just the generated tokens, disregarding syntax
        if isinstance(gen_doc, list):
            # skip the technical root; None is a placeholder for the POS tag
            tokens = [(node.t_lemma, None) for node in tree.nodes[1:]]
            gen_doc.append(tokens)
            return tree

        # full Pytreex documents: store the whole tree in the target zone
        if gen_doc:
            zone = self.get_target_zone(gen_doc)
            zone.ttree = tree.create_ttree()
            zone.sentence = unicode(da)
        return tree
Esempio n. 14
0
    def _check_pending_request(self, sc, job_no, req):
        """Check whether the given pending request has finished (i.e., a job has been
        loaded or a job has processed the given data portion).

        A finished request is removed from the list of pending requests. For job loading
        requests (`job_no` is None), the worker connection is also added to the pool
        of free services.

        @param sc: a ServiceConn object that stores the worker connection parameters
        @param job_no: current job number (is None for jobs loading)
        @param req: the request itself

        @return: the value of the finished request, or None if the request \
            has not finished yet
        """
        is_numbered_job = job_no is not None
        if is_numbered_job:
            log_debug('Checking %d' % job_no)

        # still running -- nothing to retrieve yet
        if not req.ready:
            return None

        if is_numbered_job:
            log_debug('Ready %d' % job_no)
            log_info('Retrieved finished request %d' % job_no)
        if req.error:
            # loading requests have no job number, they are reported as -1
            reported_no = job_no if is_numbered_job else -1
            log_info('Error found on request: job #%d, worker %s:%d' %
                     (reported_no, sc.host, sc.port))
        result = req.value

        # remove from list of pending requests
        # TODO return to pool of free requests (but needs to store the results somewhere)
        self.pending_requests.remove((sc, job_no, req))
        if not is_numbered_job:
            self.free_services.append(sc)

        return result
Esempio n. 15
0
    def _update_nn(self, bad_feats, good_feats, rate):
        """Run one NN update, wrapping the features into lists as expected by the update
        call, and log the cost and the norms of parameters and gradients.

        @param bad_feats: features of the "bad" instance (only the first two items are used)
        @param good_feats: features of the "good" instance (only the first two items are used)
        @param rate: passed to the update call as its last argument (presumably learning rate)
        """
        # TODO: this is just adding another dimension to fit the parallelized scoring
        # (even if updates are not parallelized). Make it nicer.
        update_args = ([bad_feats[0]], [bad_feats[1]],
                       [good_feats[0]], [good_feats[1]],
                       rate)
        cost_gcost = self.nn.update(*update_args)

        # the first returned item is logged as the cost, the rest as gradients
        log_debug('Cost:' + str(cost_gcost[0]))
        param_vals = [param.get_value() for param in self.nn.params]
        log_debug('Param norms : ' + str(self._l2s(param_vals)))
        log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))
Esempio n. 16
0
 def append(self, gold_tree, open_list, close_list):
     """Update the statistics with one generator run: check where the gold-standard
     tree ended up on the generator's open and close lists.

     @param gold_tree: the gold-standard tree
     @param open_list: the open list of the generator
     @param close_list: the close list of the generator (its top item is taken \
         as the best tree)
     """
     self.total += 1

     # is the gold tree the top one on the close list?
     top_tree = close_list.peek()[0]
     if gold_tree == top_tree:
         self.gold_best += 1
         log_debug('GOLD TREE IS BEST')

     # is the gold tree present on the close list / on the open list?
     if gold_tree in close_list:
         self.gold_on_close += 1
         log_debug('GOLD TREE IS ON CLOSE LIST')
     if gold_tree in open_list:
         self.gold_on_open += 1
         log_debug('GOLD TREE IS ON OPEN LIST')
Esempio n. 17
0
 def append(self, gold_tree, open_list, close_list):
     """Update the statistics with one generator run: check where the gold-standard
     tree ended up on the generator's open and close lists.

     @param gold_tree: the gold-standard tree
     @param open_list: the open list of the generator
     @param close_list: the close list of the generator (its top item is taken \
         as the best tree)
     """
     self.total += 1

     # is the gold tree the top one on the close list?
     top_tree = close_list.peek()[0]
     if gold_tree == top_tree:
         self.gold_best += 1
         log_debug('GOLD TREE IS BEST')

     # is the gold tree present on the close list / on the open list?
     if gold_tree in close_list:
         self.gold_on_close += 1
         log_debug('GOLD TREE IS ON CLOSE LIST')
     if gold_tree in open_list:
         self.gold_on_open += 1
         log_debug('GOLD TREE IS ON OPEN LIST')
Esempio n. 18
0
    def _training_pass(self, pass_no):
        """Run one pass over all training batches, updating the classifier on each of
        them, and print the statistics of the pass (duration, total cost, and total
        number of differences between the binarized outputs and the targets).

        @param pass_no: the number of the current pass (used for logging)
        """

        started_at = time.time()

        log_debug('\n***\nTR %05d:' % pass_no)
        log_debug("Train order: " + str(self.train_order))

        total_cost = 0
        total_diff = 0

        for batch in self.batches():

            log_debug('TREE-NOS: ' + str(batch))
            instances = [unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i])
                         for i in batch]
            log_debug("\n".join(instances))
            log_debug('Y: ' + str(self.y[batch]))

            # classify first (using the weights before the update), then update
            results = self.classif.classif(self.X[batch])
            cost_gcost = self.classif.update(self.X[batch], self.y[batch], self.alpha)
            batch_cost = cost_gcost[0]
            # binarize the classifier outputs using 0.5 as the threshold
            bin_result = np.array([[1. if r > 0.5 else 0. for r in row] for row in results])

            log_debug('R: ' + str(bin_result))
            log_debug('COST: %f' % batch_cost)
            batch_diff = np.sum(np.abs(self.y[batch] - bin_result))
            log_debug('DIFF: %d' % batch_diff)

            total_cost += batch_cost
            total_diff += batch_diff

        # print and return statistics
        duration = datetime.timedelta(seconds=(time.time() - started_at))
        self._print_pass_stats(pass_no, duration, total_cost, total_diff)
Esempio n. 19
0
    def get_all_successors(self, cand_tree):
        """Generate every tree that results from adding one node to the candidate tree.

        New nodes are only created where the current CDFS (`self.cur_cdfs`), the
        per-parent child limits (`self.max_children`) and the node number limits
        (`self.cur_limits`, if set) allow it. If a tree classifier (`self.classif`)
        is set, successors that it does not mark as a subset of the current DA are
        discarded.

        NB: This assumes projectivity (will never create a non-projective tree).

        @param cand_tree: The current candidate tree to be expanded
        @return: a list of successor trees (may be empty)
        """
        # TODO possibly avoid creating TreeNode instances for iterating
        nodes = TreeNode(cand_tree).get_descendants(add_self=1, ordered=1)
        level_counts = defaultdict(int)
        successors = []
        if self.cur_limits is not None:
            # the tree is already as large as allowed
            if len(nodes) >= self.cur_limits['total']:
                return []
            # count the nodes on each depth level
            for node in nodes:
                level_counts[node.get_depth()] += 1

        # attempt to attach one new child under each node in turn
        for parent_num, parent in enumerate(nodes):
            parent_id = self._parent_node_id(parent)
            # the parent already has as many children as allowed
            if len(parent.get_children()) >= self.max_children.get(parent_id, 0):
                continue
            # no known children for this kind of parent
            if parent_id not in self.cur_cdfs:
                continue
            # the level of the would-be child is already full
            if self.cur_limits is not None:
                child_depth = parent.get_depth() + 1
                if level_counts[child_depth] >= self.cur_limits[child_depth]:
                    continue

            # go through all formeme/t-lemma/direction variants of the new child
            for cdf_item in self.cur_cdfs[parent_id]:
                formeme, t_lemma, right = cdf_item[0]

                # variant 1: the child immediately follows/precedes the parent
                adjacent = cand_tree.clone()
                adjacent.create_child(parent_num, right, NodeData(t_lemma, formeme))
                successors.append(adjacent)

                # further variants: the child is placed after/before the subtree of each
                # existing right/left child of the parent (right/left new child, respectively)
                sibling_idxs = cand_tree.children_idxs(parent_num,
                                                       left_only=not right, right_only=right)
                for sibling_idx in sibling_idxs:
                    shifted = cand_tree.clone()
                    bound = shifted.subtree_bound(sibling_idx, right)
                    shifted.create_child(parent_num, bound + (1 if right else 0),
                                         NodeData(t_lemma, formeme))
                    successors.append(shifted)

        # nothing to filter, or no classifier to filter with
        if not (self.classif and successors):
            return successors

        # keep only the successors that do not talk about anything absent from the current DA
        orig_len = len(successors)
        is_subset = self.classif.is_subset_of_cur_da(successors)
        kept = [tree for tree, is_sub in zip(successors, is_subset) if is_sub]
        final_len = len(kept)
        if orig_len > final_len:
            log_debug('Tree classification reduced successors %d -> %d' %
                      (orig_len, final_len))
        return kept
Esempio n. 20
0
    def _training_pass(self, pass_no):
        """Perform one training pass through the whole training data, print statistics.

        If `self.train_summary_dir` is set, Tensorboard summaries are computed along
        with each training step; the summary of the last batch is written out as the
        summary of the whole pass.

        @param pass_no: the number of the current pass (used for logging and as the \
            step number of the Tensorboard summary)
        @return: a tuple of the total cost and the total number of differences between \
            the binarized outputs and the targets over the whole pass
        """

        pass_start_time = time.time()

        log_debug('\n***\nTR %05d:' % pass_no)
        log_debug("Train order: " + str(self.train_order))

        pass_cost = 0
        pass_diff = 0
        # evaluated summary of the most recent batch; stays None if there are no
        # batches (so that an empty pass does not fail on an unbound variable below)
        train_summary = None

        for tree_nos in self._batches():

            log_debug('TREE-NOS: ' + str(tree_nos))
            log_debug("\n".join(unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i])
                                for i in tree_nos))
            log_debug('Y: ' + str(self.y[tree_nos]))

            fd = {self.targets: self.y[tree_nos]}
            self._add_inputs_to_feed_dict(self.X[tree_nos], fd)
            if self.train_summary_dir:  # also compute Tensorboard summaries
                results, cost, _, train_summary = self.session.run(
                    [self.outputs, self.cost, self.train_func, self.train_summary_op], feed_dict=fd)
            else:
                results, cost, _ = self.session.run([self.outputs, self.cost, self.train_func],
                                                    feed_dict=fd)
            # binarize the outputs using 0 as the threshold
            bin_result = np.array([[1. if r > 0 else 0. for r in result] for result in results])
            batch_diff = np.sum(np.abs(self.y[tree_nos] - bin_result))

            log_debug('R: ' + str(bin_result))
            log_debug('COST: %f' % cost)
            log_debug('DIFF: %d' % batch_diff)

            pass_cost += cost
            pass_diff += batch_diff

        # print and return statistics
        self._print_pass_stats(pass_no, datetime.timedelta(seconds=(time.time() - pass_start_time)),
                               pass_cost, pass_diff)
        # Tensorboard: iteration summary (only if any batch has been processed)
        if self.train_summary_dir and train_summary is not None:
            self.train_summary_writer.add_summary(train_summary, pass_no)

        return pass_cost, pass_diff
Esempio n. 21
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation (command-line entry point).

    Parses `args` (a list of command-line arguments), loads a trained generator,
    generates one output per input DA, optionally evaluates the outputs against
    a reference file and optionally writes them to an output file.

    The evaluation file decides the output representation: with no evaluation
    file or a '.txt' one, outputs are collected in a plain list (tokens);
    otherwise they are stored in a PyTreex document (trees). An empty
    evaluation file name is treated the same as none given.
    """

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            # NOTE(review): from here on `das` holds (context, DA) pairs and is passed
            # in that form to eval_tokens/eval_trees below -- confirm they expect it.
            das = [(context, da) for context, da in zip(contexts, das)]

    # decide once how (and whether) to evaluate; an empty file name means no evaluation
    eval_as_text = bool(args.eval_file) and args.eval_file.endswith('.txt')
    eval_as_trees = bool(args.eval_file) and not eval_as_text

    # prepare evaluation
    if eval_as_trees:  # Trees: depending on PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        # generate into a fresh document if the generated trees would otherwise
        # end up under the same selector as the reference trees
        if args.ref_selector == args.target_selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc
    else:  # just tokens
        gen_doc = []

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file:
        log_info(tgen.get_slot_err_stats())
        # evaluate the generated tokens (F1 and BLEU scores)
        if eval_as_text:
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_doc)
        # evaluate the generated trees against golden trees
        else:
            eval_trees(das,
                       ttrees_from_doc(eval_doc, tgen.language, args.ref_selector),
                       ttrees_from_doc(gen_doc, tgen.language, args.target_selector),
                       eval_doc, tgen.language, tgen.selector)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_doc, args.output_file)
        else:
            write_ttrees(gen_doc, args.output_file)
Esempio n. 22
0
    def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None):
        """Run parallel perceptron training, start and manage workers."""
        # initialize the ranker instance
        log_info('Initializing...')
        # run server to process registering clients
        self._init_server()
        # spawn training jobs
        log_info('Spawning jobs...')
        host_short, _ = self.host.split('.', 1)  # short host name for job names
        for j in range(self.jobs_number):
            # set up debugging logfile only if we have it on the head
            debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
            job = Job(header='from tgen.parallel_seq2seq_train import run_training',
                      code=('run_training("%s", %d, %s)' %
                            (self.host, self.port, debug_logfile)),
                      name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                      work_dir=self.work_dir)
            job.submit(memory=self.job_memory, queue=self.queue_settings)
            self.jobs.append(job)

        # run the training passes
        try:
            cur_assign = 0
            results = [None] * self.jobs_number
            rnd_seeds = [rnd.random() for _ in range(self.jobs_number)]

            # assign training and wait for it to finish
            while cur_assign < self.jobs_number or self.pending_requests:
                log_debug('Starting loop over services.')

                # check if some of the pending computations have finished
                for sc, job_no, req in list(self.pending_requests):
                    res = self._check_pending_request(sc, job_no, req)
                    if res is not None:
                        results[job_no] = res, sc

                # check for free services and assign new computation
                while cur_assign < self.jobs_number and self.free_services:
                    log_debug('Assigning request %d' % cur_assign)
                    sc = self.free_services.popleft()
                    log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port))
                    if validation_files is not None:
                        validation_files = ','.join([os.path.relpath(f, self.work_dir)
                                                     for f in validation_files.split(',')])
                    train_func = async(sc.conn.root.train)
                    req = train_func(rnd_seeds[cur_assign],
                                     os.path.relpath(das_file, self.work_dir),
                                     os.path.relpath(ttree_file, self.work_dir),
                                     data_portion,
                                     os.path.relpath(context_file, self.work_dir)
                                     if context_file else None,
                                     validation_files)
                    self.pending_requests.add((sc, cur_assign, req))
                    cur_assign += 1
                    log_debug('Assigned %d' % cur_assign)

                # sleep for a while
                log_debug('Sleeping.')
                time.sleep(self.poll_interval)

            log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port)
                                              for cost, sc in results))

            self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME)
            results.sort(key=lambda res: res[0])
            # average the computed models
            if self.average_models:
                log_info('Creating ensemble models...')
                # use only top k if required
                results_for_ensemble = (results[:self.average_models_top_k]
                                        if self.average_models_top_k > 0
                                        else results)
                ensemble_model = self.build_ensemble_model(results_for_ensemble)
                log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path)
                ensemble_model.save_to_file(self.model_temp_path)
            # select the best result on devel data + save it
            else:
                best_cost, best_sc = results[0]
                log_info('Best cost: %f (computed at %s:%d).' % (best_cost, best_sc.host, best_sc.port))
                log_info('Saving best generator temporarily to %s...' % self.model_temp_path)
                # use relative path (working directory of worker jobs is different)
                best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir))

        # kill all jobs
        finally:
            for job in self.jobs:
                job.delete()
Esempio n. 23
0
    def _training_pass(self, pass_no):
        """Go through all training batches once, updating the classifier, and print statistics.

        For each batch from `self.batches()`, the classifier output is computed,
        the classifier is updated with learning rate `self.alpha`, and the outputs
        are binarized (1 for values above 0.5, otherwise 0) to count differences
        from the targets in `self.y`. The summed cost and difference are handed to
        `self._print_pass_stats` together with the pass duration.
        """
        started_at = time.time()

        log_debug('\n***\nTR %05d:' % pass_no)
        log_debug("Train order: " + str(self.train_order))

        total_cost = 0
        total_diff = 0

        for batch in self.batches():
            log_debug('TREE-NOS: ' + str(batch))
            instance_dump = [unicode(self.train_trees[idx]) + "\n" + unicode(self.train_das[idx])
                             for idx in batch]
            log_debug("\n".join(instance_dump))
            log_debug('Y: ' + str(self.y[batch]))

            # classify first, then update on the same batch
            outputs = self.classif.classif(self.X[batch])
            cost_gcost = self.classif.update(self.X[batch], self.y[batch], self.alpha)
            binarized = np.array([[1. if val > 0.5 else 0. for val in row] for row in outputs])

            log_debug('R: ' + str(binarized))
            log_debug('COST: %f' % cost_gcost[0])
            batch_diff = np.sum(np.abs(self.y[batch] - binarized))
            log_debug('DIFF: %d' % batch_diff)

            total_cost += cost_gcost[0]
            total_diff += batch_diff

        # print statistics for the whole pass
        elapsed = datetime.timedelta(seconds=(time.time() - started_at))
        self._print_pass_stats(pass_no, elapsed, total_cost, total_diff)
Esempio n. 24
0
def seq2seq_gen(args):
    """Command-line entry point for generating with a trained sequence-to-sequence model.

    Parses `args`, loads the generator, produces one output per input DA, and then,
    depending on the options, evaluates against reference trees, lexicalizes the
    outputs, evaluates against reference tokens, and writes the outputs to a file.
    """
    parser = ArgumentParser()

    # (names, keyword settings) of all arguments, in registration order
    argument_specs = (
        (('-e', '--eval-file'),
         {'type': str, 'help': 'A ttree/text file for evaluation'}),
        (('-a', '--abstr-file'),
         {'type': str,
          'help': 'Lexicalization file (a.k.a. abstraction instructions, for postprocessing)'}),
        (('-r', '--ref-selector'),
         {'type': str, 'default': '',
          'help': 'Selector for reference trees in the evaluation file'}),
        (('-t', '--target-selector'),
         {'type': str, 'default': '',
          'help': 'Target selector for generated trees in the output file'}),
        (('-d', '--debug-logfile'),
         {'type': str, 'help': 'Debug output file name'}),
        (('-w', '--output-file'),
         {'type': str, 'help': 'Output tree/text file'}),
        (('-b', '--beam-size'),
         {'type': int, 'help': 'Override beam size for beam search decoding'}),
        (('-c', '--context-file'),
         {'type': str, 'help': 'Input ttree/text file with context utterances'}),
        (('seq2seq_model_file',),
         {'type': str, 'help': 'Trained Seq2Seq generator model'}),
        (('da_test_file',),
         {'type': str, 'help': 'Input DAs for generation'}),
    )
    for names, settings in argument_specs:
        parser.add_argument(*names, **settings)

    args = parser.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # generator model, with the beam size optionally overridden
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # input DAs, paired with contexts if the generator can use them
    das = read_das(args.da_test_file)
    if args.context_file:
        if tgen.use_context or tgen.context_bleu_weight:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                context_doc = read_ttrees(args.context_file)
                contexts = tokens_from_doc(context_doc, tgen.language, tgen.selector)
            # NOTE(review): `das` holds (context, DA) pairs from here on and reaches
            # eval_tokens in that form -- confirm that this is what it expects.
            das = list(zip(contexts, das))
        else:
            log_warn('Generator is not trained to use context, ignoring context input file.')

    eval_given = bool(args.eval_file)
    eval_on_tokens = eval_given and args.eval_file.endswith('.txt')
    eval_on_trees = eval_given and not eval_on_tokens

    # generation proper
    log_info('Generating...')
    gen_trees = []
    for tree_no, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % tree_no)
        generated = tgen.generate_tree(da)
        gen_trees.append(generated)
    log_info(tgen.get_slot_err_stats())

    # tree-level evaluation happens before lexicalization (on delexicalized trees)
    eval_doc = None
    if eval_on_trees:
        eval_doc = read_ttrees(args.eval_file)
        Evaluator().process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                     args.target_selector or tgen.selector)

    # lexicalization needs both the instructions file and a lexicalizer in the model
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # token-level evaluation (F1 and BLEU scores) happens after lexicalization
    if eval_on_tokens:
        reference_tokens = read_tokens(args.eval_file, ref_mode=True)
        eval_tokens(das, reference_tokens, gen_trees)

    # output: plain text for .txt, a t-tree document otherwise
    if args.output_file is None:
        return
    log_info('Writing output...')
    if args.output_file.endswith('.txt'):
        write_tokens(gen_trees, args.output_file)
        return
    out_doc = create_ttree_doc(gen_trees, eval_doc, tgen.language,
                               args.target_selector or tgen.selector)
    write_ttrees(out_doc, args.output_file)
Esempio n. 25
0
def asearch_gen(args):
    """A*search generation (command-line entry point).

    Loads a candidate generator and a ranker, builds an `ASearchPlanner` and
    generates a tree for every input DA. With an evaluation file, the generated
    trees are also compared to the gold trees and statistics are logged.

    Options (parsed from the argument list `args`):
      -e FILE  t-tree file with gold trees; switches evaluation on
      -s SEL   selector of the gold trees in the evaluation file (default: '')
      -d FILE  file to write debugging output to
      -w FILE  file to write the generated t-trees to
      -c FILE  configuration file for the planner

    Exactly three positional arguments are required: candidate generator model,
    ranker model, input DA file. Otherwise the process exits with an error message.
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # the module docstring serves as the usage text; it may be missing (None)
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into a fresh document if the generated trees would otherwise
        # end up under the same selector as the gold trees
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(
                das, trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(
                              da, gold_tree, gen_tree) + "\n")

        # all three figures are printed with 4 decimal places
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(
                eval_bundle, tgen.language, tgen.selector + 'Xscore',
                "P: %.4f R: %.4f F1: %.4f" %
                p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.score_stats())
        log_info(
            "Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
            evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 26
0
    def _training_pass(self, pass_no):
        """Perform one training pass through the whole training data, print statistics.

        Each batch of tree indexes from `self._batches()` is fed to one run of the
        training function. The outputs are binarized (1 for values above 0,
        otherwise 0) and compared to the targets in `self.y`.

        If `self.train_summary_dir` is set, the Tensorboard summary computed for
        the last batch is written out under the step number `pass_no`.

        Returns a tuple (pass_cost, pass_diff): the cost summed over all batches
        and the summed absolute difference between targets and binarized outputs.
        """

        pass_start_time = time.time()

        log_debug('\n***\nTR %05d:' % pass_no)
        log_debug("Train order: " + str(self.train_order))

        pass_cost = 0
        pass_diff = 0
        # evaluated summary of the most recent batch; None until a batch is processed
        last_summary = None

        for tree_nos in self._batches():
            batch_y = self.y[tree_nos]

            log_debug('TREE-NOS: ' + str(tree_nos))
            log_debug("\n".join(
                unicode(self.train_trees[i]) + "\n" +
                unicode(self.train_das[i]) for i in tree_nos))
            log_debug('Y: ' + str(batch_y))

            fd = {self.targets: batch_y}
            self._add_inputs_to_feed_dict(self.X[tree_nos], fd)
            if self.train_summary_dir:  # also compute Tensorboard summaries
                results, cost, _, last_summary = self.session.run(
                    [
                        self.outputs, self.cost, self.train_func,
                        self.train_summary_op
                    ],
                    feed_dict=fd)
            else:
                results, cost, _ = self.session.run(
                    [self.outputs, self.cost, self.train_func], feed_dict=fd)
            bin_result = np.array([[1. if r > 0 else 0. for r in result]
                                   for result in results])
            # computed once, used for both logging and the running total
            batch_diff = np.sum(np.abs(batch_y - bin_result))

            log_debug('R: ' + str(bin_result))
            log_debug('COST: %f' % cost)
            log_debug('DIFF: %d' % batch_diff)

            pass_cost += cost
            pass_diff += batch_diff

        # print and return statistics
        self._print_pass_stats(
            pass_no,
            datetime.timedelta(seconds=(time.time() - pass_start_time)),
            pass_cost, pass_diff)
        # Tensorboard: iteration summary (skipped when there was no batch at all)
        if self.train_summary_dir and last_summary is not None:
            self.train_summary_writer.add_summary(last_summary, pass_no)

        return pass_cost, pass_diff
Esempio n. 27
0
def asearch_gen(args):
    """A*search generation (command-line entry point).

    Builds an `ASearchPlanner` from a candidate generator model and a ranker model
    and generates a tree for each DA in the input file. If an evaluation file is
    given, open/close list statistics and tree comparison statistics are logged too.

    Options (parsed from the argument list `args`):
      -e FILE  t-tree file with gold trees; switches evaluation on
      -s SEL   selector of the gold trees in the evaluation file (default: '')
      -d FILE  file to write debugging output to
      -w FILE  file to write the generated t-trees to
      -c FILE  configuration file for the planner

    Exactly three positional arguments are required: candidate generator model,
    ranker model, input DA file. Otherwise the process exits with an error message.
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # usage text comes from the module docstring, which may be absent (None)
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # a separate document is used if gold and generated trees share the selector
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das,
                                                  trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        # same precision (4 decimal places) for all three figures
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Esempio n. 28
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation (command-line entry point).

    Parses `args`, loads a trained generator and generates one tree per input DA.
    Depending on the options it then writes the delexicalized outputs, evaluates
    against reference trees, lexicalizes the outputs, evaluates against reference
    tokens, and writes the final outputs.

    If the generator uses contexts, the DAs are paired with contexts for
    generation: read from the context file, or empty ones (with a warning) if no
    context file is given. The contexts are dropped again right after generation,
    so every later step works with plain DAs.
    """
    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language,
                              selector):
        """Decide to write t-trees or tokens based on the output file name.

        A '.txt' name means postprocessed token lists; anything else means a
        t-tree document created from `base_doc`.
        """
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(
                create_ttree_doc(gen_trees, base_doc, language, selector),
                output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e',
                    '--eval-file',
                    type=str,
                    help='A ttree/text file for evaluation')
    ap.add_argument(
        '-a',
        '--abstr-file',
        type=str,
        help=
        'Lexicalization file (a.k.a. abstraction instructions, for postprocessing)'
    )
    ap.add_argument('-r',
                    '--ref-selector',
                    type=str,
                    default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument(
        '-t',
        '--target-selector',
        type=str,
        default='',
        help='Target selector for generated trees in the output file')
    ap.add_argument('-d',
                    '--debug-logfile',
                    type=str,
                    help='Debug output file name')
    ap.add_argument('-w',
                    '--output-file',
                    type=str,
                    help='Output tree/text file')
    ap.add_argument('-D',
                    '--delex-output-file',
                    type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b',
                    '--beam-size',
                    type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c',
                    '--context-file',
                    type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file',
                    type=str,
                    help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn(
                'Generator is not trained to use context, ignoring context input file.'
            )
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # We won't need contexts anymore, but we do need DAs. This is done before any
    # output is written, so that the delexicalized and the final output are both
    # postprocessed with plain DAs, not (context, DA) pairs.
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        write_trees_or_tokens(args.delex_output_file, das, gen_trees, None,
                              tgen.language, args.target_selector
                              or tgen.selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector
                                or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc,
                              tgen.language, args.target_selector
                              or tgen.selector)
Esempio n. 29
0
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Parses the given command-line arguments, loads a trained Seq2Seq generator
    from the model file, reads the input DAs (and context utterances, if a context
    file is given and the generator uses contexts), generates one tree per input
    and then, depending on the arguments, evaluates the generated trees or tokens
    against the evaluation file, lexicalizes the trees, and writes the output.

    `args` is the list of command-line argument strings; it is handed over
    to `ArgumentParser.parse_args`, which exits the program on invalid arguments.
    """

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    # remember whether the DAs got wrapped into (context, DA) pairs, so that exactly
    # those get unwrapped again after generation
    paired_with_contexts = False
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            # zip() stops at the shorter input without any notice, so report
            # a length mismatch explicitly instead of silently dropping inputs
            contexts = list(contexts)
            das = list(das)
            if len(contexts) != len(das):
                log_warn('Number of contexts (%d) does not match the number of DAs (%d), '
                         'surplus items will be ignored.' % (len(contexts), len(das)))
            das = list(zip(contexts, das))
            paired_with_contexts = True
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. '
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]
        paired_with_contexts = True

    # generate (each input is either a plain DA or a (context, DA) pair)
    log_info('Generating...')
    gen_trees = []
    for num, gen_input in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(gen_input))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # the command-line target selector takes precedence over the generator's own one
    target_selector = args.target_selector or tgen.selector
    text_eval = bool(args.eval_file) and args.eval_file.endswith('.txt')

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not text_eval:
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                target_selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if paired_with_contexts:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if text_eval:
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          target_selector),
                         args.output_file)