Example 1
    def eval_batch(self, data_iter: DataLoader, num_batches):
        def get_bleu_non0_lists(list_a, list_b):
            # Truncate both sequences at the first occurrence of token id 1
            # (presumably padding/EOS); if it never occurs, keep the full sequences.
            pos = len(list_a)
            for idx, val in enumerate(list_a):
                if val == 1:
                    pos = idx
                    break
            return list_a[:pos], list_b[:pos]

        accuracy = {
            'loss': list(),
            'perfect': list(),
            'bleu1_non0': list(),
            'bleu1': list(),
            'bleu4': list(),
            'bleu_half': list()
        }
        accuracy = OrderedDict(sorted(accuracy.items()))

        for i in range(num_batches):
            eval_loss, output, x, y = self.get_eval_batch_data(data_iter)
            accuracy['loss'].append(eval_loss)

            for idx in range(len(output)):
                real = y.T[idx]
                predict = np.argmax(output, axis=2)[idx]

                accuracy['perfect'].append(
                    all(r == p for r, p in zip(real, predict)))

                real_n0, predict_n0 = get_bleu_non0_lists(real, predict)
                accuracy['bleu1_non0'].append(
                    float(bleu.modified_precision([real_n0], predict_n0, n=1)))
                accuracy['bleu1'].append(
                    float(bleu.modified_precision([real], predict, n=1)))
                accuracy['bleu4'].append(
                    float(bleu.modified_precision([real], predict, n=4)))
                accuracy['bleu_half'].append(
                    float(
                        bleu.modified_precision(
                            [real],
                            predict,
                            # n must be an integer n-gram order.
                            n=self.model.seq_out_len // 2)))

        for k, v in accuracy.items():
            accuracy[k] = np.mean(v)

        return accuracy
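
Every snippet in this listing wraps nltk.translate.bleu_score.modified_precision in float() because the function returns a Fraction rather than a float. A minimal standalone sketch (the token lists below are made up for illustration):

from nltk.translate.bleu_score import modified_precision

reference = 'the cat is on the mat'.split()
hypothesis = 'the cat the cat'.split()

# Clipped unigram counts: 'the' -> min(2, 2) = 2, 'cat' -> min(2, 1) = 1,
# so the modified unigram precision is (2 + 1) / 4.
p1 = modified_precision([reference], hypothesis, n=1)
print(p1)         # 3/4
print(float(p1))  # 0.75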
Example 2
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                emulate_multibleu=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    if len(list_of_references) != len(hypotheses):
        print ("The number of hypotheses and their reference(s) should be the same")
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    p_n_ = [xx.numerator / xx.denominator * 100 for xx in p_n]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
    s = (w * math.log(p_i) for w, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s)) * 100
    final_bleu = round(s, 4) if emulate_multibleu else s
    return (final_bleu, *p_n_, bp, ref_lengths, hyp_lengths)
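
A minimal call sketch for the variant above. It assumes the helpers it uses (Counter, Fraction, math, modified_precision, closest_ref_length, brevity_penalty, SmoothingFunction) are imported as in nltk.translate.bleu_score; the data is made up:

list_of_references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]

# The return value packs the BLEU score, the four n-gram precisions
# (as percentages), the brevity penalty, and the reference/hypothesis lengths.
bleu, p1, p2, p3, p4, bp, ref_len, hyp_len = corpus_bleu(list_of_references, hypotheses)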
Example 3
def ngram(reference, candidate):
    # Modified n-gram precision for each order n = 1..4.
    p_gram = [0, 0, 0, 0]
    for i in range(len(p_gram)):
        p_i = modified_precision([reference], candidate, i + 1)
        p_gram[i] = float(p_i.numerator) / float(p_i.denominator)
    #print(p_gram)
    return p_gram
Example 4
def get_bleu_scores(infile1, infile2, n=1):
    print "\nComputing BLEU score for n={}...".format(n)
    f1 = open(infile1)
    f2 = open(infile2)

    sentences_f1 = f1.readlines()
    sentences_f2 = f2.readlines()
    # weights = tuple([1/float(n) for i in range(n)])
    # print 'corpus_bleu: ', bleu_score.corpus_bleu(sentences_f1, sentences_f2, weights=weights)

    bleu_scores = []

    sentence_pairs = zip(sentences_f1, sentences_f2)

    for pair in sentence_pairs:
        # Ground-truth (reference) sentence, tokenized on whitespace.
        reference = pair[0].strip().split()
        # Generated (hypothesis) sentence, tokenized on whitespace.
        hypothesis = pair[1].strip().split()

        score = bleu_score.modified_precision([reference], hypothesis, n)
        bleu_scores.append(float(score))

    avg_bleu_score = sum(bleu_scores) / float(len(bleu_scores))
    print "Average BLEU score: {}\n".format(avg_bleu_score)
Example 5
def mp(targets, responses, n_bleu=1):
    """
    Computes modified precision scores for a bunch of response-target pairs
    """
    return np.array([float(bleu_score.modified_precision(targets[i],
                                                         responses[i],
                                                         n=n_bleu))
                     for i in range(len(targets))])
Example 6
    def get_modified_precision_scores(self, reference, hypothesis, n_max_=4):
        ps = []

        for i in range(2, n_max_ + 1):
            p_i = modified_precision([reference], hypothesis, i)
            self.get_p_numerators()[i] = p_i.numerator
            self.get_p_denominators()[i] = p_i.denominator
            ps.append(p_i)

        return ps
Example 7
def get_bleu_score(premise, hypothesis, n):
    try:
        score = float(bleu_score.modified_precision(
            [re.findall(r"\w+", hypothesis)],
            re.findall(r"\w+", premise),
            n
        ))
        return score
    except ZeroDivisionError:
        return 0.0
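
A small usage sketch for get_bleu_score (it assumes `import re` and `from nltk.translate import bleu_score`, which the function relies on; the sentences are made up). Note that the first argument is scored as the candidate against the second argument as the reference:

premise = 'a man is playing a guitar'
hypothesis = 'a man plays guitar'

# Tokens are extracted with re.findall(r"\w+", ...), so punctuation is dropped.
print(get_bleu_score(premise, hypothesis, n=1))
print(get_bleu_score(premise, hypothesis, n=2))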
Example 8
def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    :param references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(
        hypotheses), "The number of hypotheses and their reference(s) should be the same"

    # Key = ngram order, and value = no. of ngram matches.
    p_numerators = Counter()
    # Key = ngram order, and value = no. of ngram in ref.
    p_denominators = Counter()
    # Key = ngram order, and value = no. of ngram in hyp.
    sysoutput_lengths = Counter()
    hyp_lengths, ref_lengths = 0, 0

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, n + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            # Adds the no. of ngrams in the hypothesis.
            sysoutput_lengths[i] += len(hypothesis) - (i - 1)

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = nist_length_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i in range(1, n + 1)]

    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
    info = [0 if p_n[i].numerator == 0 or p_n[i + 1].numerator == 0  # Handles math domain and zero division errors.
            else math.log(p_n[i].numerator / p_n[i + 1].numerator)
            for i in range(len(p_n) - 1)]
    return sum(info_i / sysoutput_lengths[i] for i, info_i in enumerate(info, start=1)) * bp
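
A minimal call sketch for the corpus_nist above (assuming Counter, Fraction, math, and the NLTK helpers modified_precision, closest_ref_length, and nist_length_penalty are in scope, e.g. via nltk.translate.bleu_score and nltk.translate.nist_score; the data is made up):

hypothesis = ['he', 'read', 'the', 'book']
reference = ['he', 'reads', 'the', 'book']

# One hypothesis with a single reference; n=2 restricts the score to
# unigram and bigram information weights.
score = corpus_nist([[reference]], [hypothesis], n=2)
print(score)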
Example 9
def custom_corpus_bleu(list_of_references,
                       hypotheses,
                       weights=(0.25, 0.25, 0.25, 0.25)):
    from collections import Counter
    from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, Fraction
    import math

    p_numerators = Counter(
    )  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter(
    )  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0, p_n, bp

    s = (w * math.log(p_i + 1e-12)
         for i, (w, p_i) in enumerate(zip(weights, p_n)))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp
Example 10
def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    :param references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"

    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    sysoutput_lengths = Counter() # Key = ngram order, and value = no. of ngram in hyp.
    hyp_lengths, ref_lengths = 0, 0

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, n + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            # Adds the no. of ngrams in the hypothesis.
            sysoutput_lengths[i] += len(hypothesis) - (i - 1)

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len =  len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = nist_length_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i in range(1, n + 1)]

    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
    info = [0 if p_n[i].numerator == 0 or p_n[i+1].numerator == 0 # Handles math domain and zero division errors.
            else math.log(p_n[i].numerator / p_n[i+1].numerator)
            for i in range(len(p_n)-1)]
    return sum(info_i/sysoutput_lengths[i] for i, info_i in enumerate(info, start=1)) * bp
Example 11
 def test_sentence_bleu(self):
     sm = SmoothingFunction()
     reference1 = 'the cat is on the mat'.split()
     reference2 = 'there is a cat on the mat'.split()
     references = [reference1, reference2]
     hypothesis1 = 'the the the the the the the'.split(
     )  # known to 'trick' BLEU
     hypothesis2 = 'the cat the the the the mat'.split(
     )  # should still score much higher for BLEU-1 (but not BLEU-4)
     score1_mod = float(modified_precision(references, hypothesis1, n=1))
     score1_bleu1 = sentence_bleu(references, hypothesis1, (1, 0, 0, 0))
     score1_bleu1_method5 = sentence_bleu(references,
                                          hypothesis1, (1, 0, 0, 0),
                                          smoothing_function=sm.method5)
     score1_bleu4 = sentence_bleu(references, hypothesis1)
     print(
         "Scores for hypothesis1 modified=%f, BLEU-1=%f (method5 %f), BLEU-4=%f"
         % (score1_mod, score1_bleu1, score1_bleu1_method5, score1_bleu4))
     score2_mod = float(modified_precision(references, hypothesis2, n=1))
     score2_bleu1 = sentence_bleu(references, hypothesis2, (1, 0, 0, 0))
     score2_bleu1_method5 = sentence_bleu(references,
                                          hypothesis2, (1, 0, 0, 0),
                                          smoothing_function=sm.method5)
     score2_bleu4 = sentence_bleu(references, hypothesis2)
     print(
         "Scores for hypothesis2 modified=%f, BLEU-1=%f (method5 %f), BLEU-4=%f"
         % (score2_mod, score2_bleu1, score2_bleu1_method5, score2_bleu4))
     self.assertLess(score1_mod, 0.3)
     self.assertGreater(score1_bleu1, 0.0)
     self.assertLess(score1_bleu1, 0.3)
     self.assertGreater(score1_bleu1, 0.0)
     self.assertLess(score1_bleu4, 0.1)
     self.assertLess(score2_mod, 0.6)
     self.assertGreater(score2_bleu1, 0.0)
     self.assertLess(score2_bleu1, 0.6)
     self.assertGreater(score2_bleu4, 0.0)
     self.assertLess(score2_bleu4, 0.1)
Example 12
 def test_sentence_perfect1(self):
     reference1 = 'You eat a cheese sandwich'.split()
     reference2 = 'You\'re eating a cheese sandwich'.split()
     reference3 = 'You are eating a cheese sandwich'.split()
     references = [reference1, reference2, reference3]
     hypothesis1 = 'You eat a cheese sandwich'.split()
     score1_mod = float(modified_precision(references, hypothesis1, n=1))
     score1_bleu1 = sentence_bleu([reference1], hypothesis1, (1, 0, 0, 0))
     score1_bleu4 = sentence_bleu([reference1], hypothesis1)
     score_bleu4 = sentence_bleu(references, hypothesis1)
     print(
         "Scores for hypothesis modified=%f, BLEU-1=%f, BLEU-4=%f (all %f)"
         % (score1_mod, score1_bleu1, score1_bleu4, score_bleu4))
     self.assertGreaterEqual(score1_bleu4, 1.0)
     self.assertGreaterEqual(score_bleu4, 1.0)
Example 13
def cal_BLEU(hypp, reff, data, ngram, debug):
	hyp = []
	for s in hypp:
		hyp.append(prepare_for_bleu(s, data, ngram))
	ref = []
	for s in reff:
		ref.append(prepare_for_bleu(s, data, ngram))
	# Check URL below for this powerful package (modified_precision):
	# http://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.modified_precision
	bleu = [0.] * len(ngram)
	if debug:
		print(hyp)
	for s in hyp:
		#print(s)
		for i in range(len(ngram)):
			bleu[i] += round(modified_precision(ref, s, n=ngram[i]), 5)
	return [x / len(hyp) for x in bleu]
Example 14
def tokenize_modified_precision(reference, candidate, n=3, verbose=False):
    """

    :param reference:
    :param candidate:
    :param n:
    :return:
    # TODO:
    >>> tokenize_modified_precision("der String von a","der String von b", n=1 )
    0.75
    """
    if verbose:
        print("#" * 30 + "\r\ncandidate:\r\n", candidate, "\r\nreference:",
              reference)
    candidate, reference = tokenize(candidate), tokenize(reference)
    bleu = float(bleu_score.modified_precision([reference], candidate, n=n))

    return bleu
Example 15
def mark_text(filename):
    print(filename)
    textframe = pd.read_csv(textdir + filename)
    newsframe = pd.read_csv(newsdir + filename)
    newstext = []
    mark = []
    for item in newsframe['word_sequence']:
        # Skip missing values (NaN entries are read in as floats).
        if not isinstance(item, float):
            sentence = re.split(r'[//]', item)
            newstext.append(sentence)
    for item in textframe['word_sequence']:
        sentence = re.split(r'[//]', item)
        try:
            mark.append(float(modified_precision(references = newstext, hypothesis = sentence, n = 2)))
        except ZeroDivisionError:
            mark.append(0)
    textframe['f_score'] = mark
    # print(textframe[textframe['f_score'] != 0]['f_score'])
    textframe.to_csv(textdir + filename, index = False)
Example 16
    def batch_precision_parameters(self, references: List[np.ndarray], hypotheses: List[np.ndarray]) -> List[float]:
        """
            Calculate modified precision per n_gram for input references and hypotheses combinations.

            Args:
                references: Ground truth sentences.
                hypotheses: Predicted sentences.

            Returns:
                List of sentence level bleu scores
        """

        assert len(references) == len(hypotheses), (
            "The number of hypotheses and their reference(s) should be the same ")

        sentence_level_scores = []
        # Iterate through each hypothesis and their corresponding references.
        for reference, hypothesis in zip(references, hypotheses):

            # For each order of ngram, calculate the correct predicted words and
            # total predicted words for the corpus-level modified precision.
            reference = get_formated_reference(reference)
            hypothesis = get_formated_list(hypothesis)
            for i in range(1, self.n_gram + 1):
                p_i = modified_precision(reference, hypothesis, i)
                self.no_of_correct_predicted[i] += p_i.numerator
                self.no_of_total_predicted[i] += p_i.denominator

            sentence_level_scores.append(sentence_bleu(reference, hypothesis, self.weights, self.smoothing_function))

            # Calculate the hypothesis length and the closest reference length.
            # Adds them to the corpus-level hypothesis and reference counts.
            hyp_len = len(hypothesis)
            self.total_hypotheses_length += hyp_len
            ref_lens = (len(ref) for ref in reference)
            self.total_references_length += min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))

        return sentence_level_scores
Example 17
def basic_precision_recall(r, h, display=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    r_numerators = Counter() # Key = ngram order, and value = no. of matched ref ngrams (recall).
    r_denominators = Counter() # Key = ngram order, and value = total no. of ngrams in ref (recall).
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}

    if display:
        print("total utts={0:d}".format(len(r)))

    i=1

    for references, hypothesis in zip(r, h):
        # if min([len(any_ref) for any_ref in references]) > 0:
        if len(hypothesis) > 0:
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

            metrics["tc"] += p_i.numerator
            metrics["tp"] += p_i.denominator
        else:
            p_numerators[i] += 0
            p_denominators[i] += 0

            metrics["tc"] += 0
            metrics["tp"] += 0

        #print(p_i.numerator, p_i.denominator)

        tot_match = 0
        tot_count = 0

        max_recall_match, max_tp, max_t, max_word_level_details = count_match(references[0], hypothesis)
        max_recall = max_recall_match / max_t if max_t > 0 else 0

        for curr_ref in references:
            curr_match, curr_tp, curr_t, curr_word_level_details = count_match(curr_ref, hypothesis)
            curr_recall = curr_match / curr_t if curr_t > 0 else 0

            if curr_recall > max_recall:
                max_recall_match = curr_match
                max_t = curr_t
                max_recall = curr_recall
                max_word_level_details = curr_word_level_details

        r_numerators[i] += max_recall_match
        r_denominators[i] += max_t
        metrics["rc"] += max_recall_match
        metrics["rt"] += max_t
        for key in {"t","tp","tc"}:
            for w in max_word_level_details[key]:
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w][key] += max_word_level_details[key][w]

    prec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(p_numerators.values(), p_denominators.values())]
    rec = [(n / d) * 100 if d > 0 else 0 for n,d in zip(r_numerators.values(), r_denominators.values())]

    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-"*54)
        print("{0:10s} | {1:8.2f}".format("precision", *prec))
        print("{0:10s} | {1:8.2f}".format("recall", *rec))

    return prec[0], rec[0], metrics
Example 18
def ngram_overlap(s1, s2, n=1):
    w1 = s1.split(" ")
    w2 = s2.split(" ")
    return float(modified_precision([w1], w2, n))
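
A quick usage sketch for ngram_overlap (the sentences are made up):

# Both hypothesis words occur in the reference, so the clipped unigram
# precision is 2/2 = 1.0.
print(ngram_overlap('the cat sat on the mat', 'the cat', n=1))  # 1.0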
Example 19
def modified_corpus_bleu(list_of_references,
                         hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None,
                         auto_reweigh=False):
    """
    modified from nltk.translate.bleu_score.corpus_bleu,
    returns 'multi-bleu.perl'-like intermediate results.
    Args:
        list_of_references:
        hypotheses:
        weights:
        smoothing_function:
        auto_reweigh:

    Returns:

    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter(
    )  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter(
    )  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), f"The number of hypotheses and their reference(s) should be " \
                                                       f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Return all-zero results if there are no matching n-grams.
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_len)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
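
A usage sketch for modified_corpus_bleu (assuming the NLTK helpers it relies on are imported as in nltk.translate.bleu_score; the data is made up). The tuple mirrors the fields reported by multi-bleu.perl:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]

# score, per-order precisions, brevity penalty, length ratio,
# hypothesis length, reference length
score, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(refs, hyps)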
Example 20
    def test_modified_precision(self):
        """
        Examples from the original BLEU paper 
        http://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = 'the cat is on the mat'.split()
        ref2 = 'there is a cat on the mat'.split()
        # Hypothesis sentence(s).
        hyp1 = 'the the the the the the the'.split()
        
        references = [ref1, ref2] 
        
        # Testing modified unigram precision.
        hyp1_unigram_precision =  float(modified_precision(references, hyp1, n=1))
        assert (round(hyp1_unigram_precision, 4) == 0.2857)
        # With assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
        
        # Testing modified bigram precision.
        assert(float(modified_precision(references, hyp1, n=2)) == 0.0)
        
        
        # Example 2: the "of the" example.
        # Reference sentences
        ref1 = str('It is a guide to action that ensures that the military '
                   'will forever heed Party commands').split()
        ref2 = str('It is the guiding principle which guarantees the military '
                   'forces always being under the command of the Party').split()
        ref3 = str('It is the practical guide for the army always to heed '
                   'the directions of the party').split()
        # Hypothesis sentence(s).
        hyp1 = 'of the'.split()
        
        references = [ref1, ref2, ref3] 
        # Testing modified unigram precision.
        assert (float(modified_precision(references, hyp1, n=1)) == 1.0)
        
        # Testing modified bigram precision.
        assert(float(modified_precision(references, hyp1, n=2)) == 1.0)
        

        # Example 3: Proper MT outputs.
        hyp1 = str('It is a guide to action which ensures that the military '
                   'always obeys the commands of the party').split()
        hyp2 = str('It is to insure the troops forever hearing the activity '
                   'guidebook that party direct').split()
        
        references = [ref1, ref2, ref3]
        
        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        assert (round(hyp1_unigram_precision, 4) == 0.9444)
        assert (round(hyp2_unigram_precision, 4) == 0.5714)
        
        # Bigram precision
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        assert (round(hyp1_bigram_precision, 4) == 0.5882)
        assert (round(hyp2_bigram_precision, 4) == 0.0769)
Example 21
def get_bleu_score(candidate, reference, ngrams):
    candidate = candidate.split(' ')
    reference = [reference.split(' ')]
    return float(modified_precision(reference, candidate, ngrams))
Example 22
    return s

# print test results
i=0
j=81
answers=[]
for x in range(0,20):
    answers.append([])
for p in predictions:
    print str(j)+". Expected output: "+to_sentence(sentences[i])
    print str(j)+". Output: ",
    for w in p:
        if w!=0:
            print y_ix_to_word[w],
            answers[i].append(y_ix_to_word[w])
    print ""
    print ""
    i+=1
    j+=1

from nltk.translate.bleu_score import modified_precision

# print BLEU scores for output sentences with reference to expected output.
j=81
for i in range(0,20):
    bs = float(modified_precision(sentences[i], answers[i],1))
    print "BLEU Score for video number "+str(j)+": "+str(bs)
    j+=1


Example 23
def bleu(ref, hyp):
    return float(modified_precision([ref.split()], hyp.split(), n=1))
Example 24
def corpus_bleu(list_of_references,
                hypotheses,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None,
                auto_reweigh=False,
                averaging_mode="geometric",
                no_length_penalty=False):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter(
    )  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter(
    )  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the "
        "same ")

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', 'Not sure how to apply length penalty in arithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_lengths)

    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)

    return s
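
A brief sketch of the extra parameters in this variant (assuming the NLTK helpers it relies on are imported as in nltk.translate.bleu_score; the data is made up). The arithmetic averaging mode only makes sense without the length penalty, which is what the assertion above enforces:

refs = [[['he', 'was', 'interested', 'in', 'world', 'history']]]
hyps = [['he', 'was', 'interested', 'in', 'history']]

# Geometric mean of n-gram precisions with brevity penalty (standard BLEU).
geo = corpus_bleu(refs, hyps)
# Arithmetic mean of n-gram precisions, no brevity penalty.
arith = corpus_bleu(refs, hyps, averaging_mode='arithmetic', no_length_penalty=True)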
Example 25
    def test_modified_precision(self):
        """
        Examples from the original BLEU paper
        http://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = 'the cat is on the mat'.split()
        ref2 = 'there is a cat on the mat'.split()
        # Hypothesis sentence(s).
        hyp1 = 'the the the the the the the'.split()

        references = [ref1, ref2]

        # Testing modified unigram precision.
        hyp1_unigram_precision =  float(modified_precision(references, hyp1, n=1))
        assert (round(hyp1_unigram_precision, 4) == 0.2857)
        # With assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)

        # Testing modified bigram precision.
        assert(float(modified_precision(references, hyp1, n=2)) == 0.0)


        # Example 2: the "of the" example.
        # Reference sentences
        ref1 = str('It is a guide to action that ensures that the military '
                   'will forever heed Party commands').split()
        ref2 = str('It is the guiding principle which guarantees the military '
                   'forces always being under the command of the Party').split()
        ref3 = str('It is the practical guide for the army always to heed '
                   'the directions of the party').split()
        # Hypothesis sentence(s).
        hyp1 = 'of the'.split()

        references = [ref1, ref2, ref3]
        # Testing modified unigram precision.
        assert (float(modified_precision(references, hyp1, n=1)) == 1.0)

        # Testing modified bigram precision.
        assert(float(modified_precision(references, hyp1, n=2)) == 1.0)


        # Example 3: Proper MT outputs.
        hyp1 = str('It is a guide to action which ensures that the military '
                   'always obeys the commands of the party').split()
        hyp2 = str('It is to insure the troops forever hearing the activity '
                   'guidebook that party direct').split()

        references = [ref1, ref2, ref3]

        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        assert (round(hyp1_unigram_precision, 4) == 0.9444)
        assert (round(hyp2_unigram_precision, 4) == 0.5714)

        # Bigram precision
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        assert (round(hyp1_bigram_precision, 4) == 0.5882)
        assert (round(hyp2_bigram_precision, 4) == 0.0769)
Example 26
def calculate_bleu(name,
                   dataset,
                   q_field,
                   a_field,
                   as_field,
                   QnA_vocab,
                   model,
                   device,
                   max_len=50):

    results = []
    precision_total = 0
    score = 0
    instances = len(dataset)

    for data in dataset:

        q = vars(data)['Q']
        a = vars(data)['A']
        a_sen = vars(data)['Ans_Sen']

        if name == "CNN":
            hypothesis, _ = cnn_predict(q, a, q_field, a_field, as_field,
                                        QnA_vocab, model, device, max_len)
        elif name == "ATTN":
            hypothesis, _ = attn_predict(q, a, q_field, a_field, as_field,
                                         QnA_vocab, model, device, max_len)
        elif name == "COPYNET":
            hypothesis = copynet_predict(q, a, q_field, a_field, as_field,
                                         QnA_vocab, model, device, max_len)
        else:
            hypothesis = []
        #cut off <eos> token

        hypothesis = hypothesis[:-1]
        reference = a_sen
        #reference = [t.lower() for t in a_sen]

        blue_score = example_score(reference, hypothesis)
        precision = float(modified_precision([reference], hypothesis, n=1))
        #print(precision)

        results.append({
            'question': q,
            'answer': a,
            'reference': reference,
            'hypothesis': hypothesis,
            'blue_score': blue_score,
            'precision': precision
        })

        score += blue_score
        precision_total += precision

    if name == "CNN":
        with open('CNN_results.txt', 'w') as file:
            file.write(json.dumps(results))

    elif name == "ATTN":
        with open('ATTN_results.txt', 'w') as file:
            file.write(json.dumps(results))

    elif name == "COPYNET":
        with open('COPYNET_results.txt', 'w') as file:
            file.write(json.dumps(results))

    return score / instances, precision_total / instances