Example #1
    def test_brevity_penalty(self):
        # Test case from brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.py
        references = [['a'] * 11, ['a'] * 8]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4)

        references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
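The expected values in the test above follow directly from the standard BLEU brevity penalty, BP = exp(1 - r/c) when the hypothesis length c is shorter than the closest reference length r, and 1.0 otherwise. A minimal sketch reproducing the first assertion with the same NLTK helpers (only the numbers from the test are used):

from math import exp

from nltk.translate.bleu_score import brevity_penalty, closest_ref_length

references = [['a'] * 11, ['a'] * 8]
hypothesis = ['a'] * 7

hyp_len = len(hypothesis)                          # c = 7
closest = closest_ref_length(references, hyp_len)  # r = 8, the reference length closest to 7
print(brevity_penalty(closest, hyp_len))           # ~0.8669
print(exp(1 - closest / hyp_len))                  # same value, computed by hand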
Example #3
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: whether to apply the brevity penalty

    Return:
        BLEU score
    """

    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

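    # sentence_bleu already applies the brevity penalty; when penalty is False
    # (and bpenalty is non-zero) it is divided back out below.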
    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure / bpenalty
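A quick usage sketch for bleu_advanced as defined above (the token lists are invented for illustration, and the module-level SMOOTH used as the default smoothing function is assumed to be nltk's SmoothingFunction()):

y_true = ['the', 'cat', 'sat', 'on', 'the', 'mat']
y_predicted = ['the', 'cat', 'sat']

with_bp = bleu_advanced(y_true, y_predicted, weights=(1,), penalty=True)
without_bp = bleu_advanced(y_true, y_predicted, weights=(1,), penalty=False)
# without_bp >= with_bp: the short hypothesis is no longer penalized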
Example #5
def calculate_bleu(list_of_references, hypotheses):
    '''
    by Qiu
    '''
    hyp_lengths, ref_lengths = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)
    try:
        bp = brevity_penalty(ref_lengths, hyp_lengths)
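        # Dividing by bp strips the brevity penalty that corpus_bleu already
        # applied, so bleu1-bleu4 report plain n-gram precisions; bleu_all keeps it.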
        bleu1 = corpus_bleu(
            list_of_references, hypotheses, weights=(1, 0, 0, 0)) / bp * 100
        bleu2 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 1, 0, 0)) / bp * 100
        bleu3 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 0, 1, 0)) / bp * 100
        bleu4 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 0, 0, 1)) / bp * 100
        bleu_all = corpus_bleu(list_of_references,
                               hypotheses,
                               weights=(0.25, 0.25, 0.25, 0.25)) * 100
    except Exception:
        bleu_all, bleu1, bleu2, bleu3, bleu4 = 0.0, 0.0, 0.0, 0.0, 0.0

    return bleu_all, bleu1, bleu2, bleu3, bleu4
Example #6
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                emulate_multibleu=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    if len(list_of_references) != len(hypotheses):
        print("The number of hypotheses and their reference(s) should be the same")
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    p_n_ = [xx.numerator / xx.denominator * 100 for xx in p_n]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
    s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))
    s = bp * math.exp(math.fsum(s)) * 100
    final_bleu = round(s, 4) if emulate_multibleu else s
    return (final_bleu, *p_n_, bp, ref_lengths, hyp_lengths)
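A usage sketch for the variant above (token lists invented for illustration; the helpers it calls, such as modified_precision and SmoothingFunction, are assumed to be imported from nltk.translate.bleu_score at module level). Besides the rounded score it also returns the four n-gram precisions in percent, the brevity penalty, and the two corpus lengths:

references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'is', 'on', 'the', 'mat']]

bleu, p1, p2, p3, p4, bp, ref_len, hyp_len = corpus_bleu(
    references, hypotheses, emulate_multibleu=True)
# identical hypothesis and reference: bleu == 100.0, bp == 1.0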
Example #7
    def get_brevity_penalty(self) -> float:
        """
            Calculate the brevity penalty of the corpus.

            Returns:
                Brevity penalty for corpus.
        """
        return brevity_penalty(self.total_references_length, self.total_hypotheses_length)
Example #8
def compute_bp(hypotheses, list_of_references):
    hyp_lengths, ref_lengths = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)
    return bp
def custom_corpus_bleu(list_of_references,
                       hypotheses,
                       weights=(0.25, 0.25, 0.25, 0.25)):
    from collections import Counter
    from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, Fraction
    import math

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0, p_n, bp

    s = (w * math.log(p_i + 1e-12)
         for i, (w, p_i) in enumerate(zip(weights, p_n)))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp
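A usage sketch for the two helpers above (token lists invented for illustration; compute_bp assumes closest_ref_length and brevity_penalty are imported from nltk.translate.bleu_score at module level). compute_bp yields just the corpus-level brevity penalty, while custom_corpus_bleu also exposes the per-order precisions and the penalty it used:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'the', 'mat']]

bp = compute_bp(hyps, refs)                     # 1.0: the hypothesis is not shorter
score, p_n, bp = custom_corpus_bleu(refs, hyps)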
Example #10
File: bleu.py Project: kaniblu/vhda
    def compute_bleu(self, ref: Sequence[Sequence[str]],
                     hyp: Sequence[Sequence[str]]):
        r_lens = list(sorted(set(map(len, ref))))
        chencherry = bl.SmoothingFunction()
        methods = (
            ("smooth0", chencherry.method0),
            ("smooth1", chencherry.method1),
            ("smooth2", chencherry.method2),
            ("smooth3", chencherry.method3),
            ("smooth4", chencherry.method4),
            ("smooth5", chencherry.method5),
            ("smooth6", chencherry.method6),
            ("smooth7", chencherry.method7)
        )
        if not hyp:
            return {name: 0.0 for name, _ in methods}
        stats = {
            name: np.mean([
                (self.try_compute(ref, h, smoothing_function=method) *
                 bl.brevity_penalty(self.closest_rlen(r_lens, len(h)), len(h)))
                for h in hyp
            ]) for name, method in methods
        }
        return stats
Example #11
def corpus_bleu(list_of_references,
                hypotheses,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None,
                auto_reweigh=False,
                averaging_mode="geometric",
                no_length_penalty=False):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses.

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the "
        "same ")

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', 'Not sure how to apply the length penalty in arithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_lengths)

    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)

    return s
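A usage sketch for the two extra switches this variant adds (token lists invented for illustration): no_length_penalty=True pins the brevity penalty to 1.0 in geometric mode, and averaging_mode="arithmetic" sums the weighted precisions instead of combining them in log space (the asserts above only permit it together with no_length_penalty):

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'the', 'mat']]

geometric = corpus_bleu(refs, hyps)                       # standard BLEU with bp
no_bp = corpus_bleu(refs, hyps, no_length_penalty=True)   # bp forced to 1.0
arith = corpus_bleu(refs, hyps, no_length_penalty=True,
                    averaging_mode="arithmetic")          # mean of weighted precisions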
Example #12
def modified_corpus_bleu(list_of_references,
                         hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None,
                         auto_reweigh=False):
    """
    modified from nltk.translate.bleu_score.corpus_bleu,
    returns 'multi-bleu.perl'-like intermediate results.
    Args:
        list_of_references:
        hypotheses:
        weights:
        smoothing_function:
        auto_reweigh:

    Returns:

    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), f"The number of hypotheses and their reference(s) should be " \
                                                       f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_len)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
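The tuple returned above mirrors multi-bleu.perl's report; a minimal unpacking sketch (token lists invented for illustration, helpers again assumed to come from nltk.translate.bleu_score):

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'the', 'mat']]

bleu, p_n, bp, length_ratio, hyp_len, ref_len = modified_corpus_bleu(refs, hyps)
print(f"BLEU = {bleu * 100:.2f}, ratio = {length_ratio:.3f} ({hyp_len}/{ref_len})")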