Example #1
    def _compute_bleu(self, cur_valid_out, valid_trees):
        """Compute BLEU score of the current output on a set of validation trees. If the
        validation set is a tuple (two paraphrases), use them both for BLEU computation.

        @param cur_valid_out: the current system output on the validation DAs
        @param valid_trees: the gold trees for the validation DAs (one or two paraphrases)
        @return: BLEU score, as a float in the 0-1 range (multiply by 100 for a percentage)
        """
        evaluator = BLEUMeasure()
        for pred_tree, gold_trees in zip(cur_valid_out, valid_trees):
            evaluator.append(pred_tree, gold_trees)
        return evaluator.bleu()
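
For context, here is a minimal standalone sketch of the same accumulation pattern. The `tgen.bleu` import path and the toy data are assumptions; tokens are given as (form, tag) pairs, mirroring the reranking example below:

from tgen.bleu import BLEUMeasure   # import path is an assumption

# Accumulate one predicted sentence against all of its reference paraphrases,
# then compute corpus-level BLEU; bleu() returns a fraction in [0, 1].
evaluator = BLEUMeasure()
pred = [('the', None), ('restaurant', None), ('is', None), ('cheap', None)]
refs = [[('the', None), ('restaurant', None), ('is', None), ('cheap', None)],
        [('it', None), ('is', None), ('a', None), ('cheap', None), ('restaurant', None)]]
evaluator.append(pred, refs)   # one prediction, all its reference paraphrases
print('BLEU: %.2f' % (evaluator.bleu() * 100))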
Example #2
    def _rerank_paths(self, paths, da):
        """Rerank the n-best decoded paths according to the reranking classifier and/or
        BLEU against context."""

        trees = [self.tree_embs.ids_to_tree(np.array(path.dec_inputs).transpose()[0])
                 for path in paths]

        # rerank using BLEU against context if set to do so
        if self.context_bleu_weight:
            bm = BLEUMeasure(max_ngram=2)
            bleus = []
            for path, tree in zip(paths, trees):
                bm.reset()
                bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
                bleu = (bm.ngram_precision()
                        if self.context_bleu_metric == 'ngram_prec'
                        else bm.bleu())
                bleus.append(bleu)
                path.logprob += self.context_bleu_weight * bleu

            log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                      "\n".join([("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                                 for b, t in zip(bleus, trees)]))

        # add distances to logprob so that non-fitting will be heavily penalized
        if self.classif_filter:
            self.classif_filter.init_run(da)
            fits = self.classif_filter.dist_to_cur_da(trees)
            for path, fit in zip(paths, fits):
                path.logprob -= self.misfit_penalty * fit

            log_debug(("Misfits for DA: %s\n\n" % str(da)) +
                      "\n".join([("%.5f\t" % fit) +
                                 " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                                 for fit, tree in zip(fits, trees)]))

        # adjust paths for length (if set to do so)
        if self.length_norm_weight:
            for path in paths:
                path.logprob /= len(path) ** self.length_norm_weight

        return sorted(paths, cmp=lambda p, q: cmp(p.logprob, q.logprob), reverse=True)
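
Note that this example uses Python 2 idioms: `unicode` and the `cmp`-based `sorted` call, both of which were removed in Python 3. A minimal Python 3 equivalent of the final sort (a sketch, assuming `logprob` values are directly comparable):

# Python 3: sort by log-probability, highest first; the cmp= argument no longer exists
best_first = sorted(paths, key=lambda p: p.logprob, reverse=True)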
Example #3
def eval_tokens(das, eval_tokens, gen_tokens):
    """Evaluate generated tokens and print out statistics."""
    postprocess_tokens(eval_tokens, das)
    postprocess_tokens(gen_tokens, das)

    evaluator = BLEUMeasure()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        evaluator.append(pred_sent, gold_sents)
    log_info("BLEU score: %.4f" % (evaluator.bleu() * 100))

    evaluator = Evaluator()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        for gold_sent in gold_sents:  # effectively an average over all gold paraphrases
            evaluator.append(gold_sent, pred_sent)

    log_info("TOKEN precision: %.4f, Recall: %.4f, F1: %.4f" % evaluator.p_r_f1(EvalTypes.TOKEN))
    log_info("Sentence length stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaluator.size_stats())
    log_info("Common subphrase stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
             evaluator.common_substruct_stats())
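
Because the `Evaluator` loop appends the prediction once for every gold paraphrase, token precision and recall are effectively micro-averaged over all paraphrases. A self-contained sketch of that averaging idea, using plain Python with toy data (this is illustrative only, not tgen's `Evaluator` implementation):

from collections import Counter

# Micro-averaged token P/R/F1 over multiple (gold, pred) pairs: accumulate
# multiset overlaps and lengths across all pairs, then compute from the totals.
def token_p_r_f1(pairs):
    """pairs: iterable of (gold_tokens, pred_tokens) token lists."""
    overlap = gold_total = pred_total = 0
    for gold, pred in pairs:
        overlap += sum((Counter(gold) & Counter(pred)).values())
        gold_total += len(gold)
        pred_total += len(pred)
    p = overlap / float(pred_total) if pred_total else 0.0
    r = overlap / float(gold_total) if gold_total else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1

# same prediction scored against two gold paraphrases
pairs = [(['a', 'cheap', 'place'], ['a', 'cheap', 'restaurant']),
         (['it', 'is', 'cheap'], ['a', 'cheap', 'restaurant'])]
print('P=%.4f R=%.4f F1=%.4f' % token_p_r_f1(pairs))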