def test_brevity_penalty(self):
    # Test case from brevity_penalty_closest function in mteval-v13a.pl.
    # Same test cases as in the doctest in nltk.translate.bleu_score.py
    references = [['a'] * 11, ['a'] * 8]
    hypothesis = ['a'] * 7
    hyp_len = len(hypothesis)
    closest_ref_len = closest_ref_length(references, hyp_len)
    self.assertAlmostEqual(brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4)

    references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    hypothesis = ['a'] * 7
    hyp_len = len(hypothesis)
    closest_ref_len = closest_ref_length(references, hyp_len)
    assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
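# A small worked check of where the expected values above come from, assuming
# nltk's standard brevity penalty BP = exp(1 - r/c) when the hypothesis length c
# is at most the closest reference length r, and BP = 1 otherwise:
import math

print(math.exp(1 - 8 / 7))  # closest ref len 8 vs. hyp len 7 -> ~0.8669
# In the second case a reference of length 7 matches the hypothesis length
# exactly, so the closest reference length is 7 and BP == 1.0.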
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score.

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: option to re-normalize the weights uniformly
        penalty: whether to apply the brevity penalty

    Return:
        BLEU score
    """
    bleu_measure = sentence_bleu([y_true], y_predicted, weights,
                                 smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure / bpenalty
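# A minimal usage sketch for bleu_advanced above (made-up token lists). It assumes
# the names used in the signature are already in scope, roughly:
#     from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu,
#                                            closest_ref_length, brevity_penalty)
#     SMOOTH = SmoothingFunction()
reference = 'the cat sat on the mat'.split()
hypothesis = 'the cat sat on mat'.split()

with_penalty = bleu_advanced(reference, hypothesis, weights=(0.5, 0.5))
# penalty=False divides the brevity penalty back out of the sentence-level score.
without_penalty = bleu_advanced(reference, hypothesis, weights=(0.5, 0.5), penalty=False)
print(with_penalty, without_penalty)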
def calculate_bleu(list_of_references, hypotheses):
    ''' by Qiu '''
    hyp_lengths, ref_lengths = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)
    try:
        bp = brevity_penalty(ref_lengths, hyp_lengths)
        bleu1 = corpus_bleu(list_of_references, hypotheses, weights=(1, 0, 0, 0)) / bp * 100
        bleu2 = corpus_bleu(list_of_references, hypotheses, weights=(0, 1, 0, 0)) / bp * 100
        bleu3 = corpus_bleu(list_of_references, hypotheses, weights=(0, 0, 1, 0)) / bp * 100
        bleu4 = corpus_bleu(list_of_references, hypotheses, weights=(0, 0, 0, 1)) / bp * 100
        bleu_all = corpus_bleu(list_of_references, hypotheses,
                               weights=(0.25, 0.25, 0.25, 0.25)) * 100
    except Exception:
        bleu_all, bleu1, bleu2, bleu3, bleu4 = 0.0, 0.0, 0.0, 0.0, 0.0
    return bleu_all, bleu1, bleu2, bleu3, bleu4
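# An illustrative call to calculate_bleu above (made-up corpus). Note the nesting:
# each entry of list_of_references is itself a list of tokenized reference sentences
# for the corresponding hypothesis.
list_of_references = [
    [['the', 'cat', 'sat', 'on', 'the', 'mat']],           # references for hypothesis 1
    [['there', 'is', 'a', 'dog', 'in', 'the', 'garden']],  # references for hypothesis 2
]
hypotheses = [
    ['the', 'cat', 'sat', 'on', 'mat'],
    ['a', 'dog', 'is', 'in', 'the', 'garden'],
]
bleu_all, bleu1, bleu2, bleu3, bleu4 = calculate_bleu(list_of_references, hypotheses)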
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                emulate_multibleu=False):
    p_numerators = Counter()    # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    if len(list_of_references) != len(hypotheses):
        print("The number of hypotheses and their reference(s) should be the same")
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]
    p_n_ = [xx.numerator / xx.denominator * 100 for xx in p_n]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0

    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)

    s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))
    s = bp * math.exp(math.fsum(s)) * 100
    final_bleu = round(s, 4) if emulate_multibleu else s
    return (final_bleu, *p_n_, bp, ref_lengths, hyp_lengths)
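# A quick usage sketch for the corpus_bleu variant above (made-up corpus). Unlike
# nltk's corpus_bleu, it returns a tuple of multi-bleu-style intermediate values
# rather than a single float.
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'mat']]

bleu, p1, p2, p3, p4, bp, ref_len, hyp_len = corpus_bleu(list_of_references, hypotheses)
print(bleu, (p1, p2, p3, p4), bp, ref_len, hyp_len)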
def compute_bp(hypotheses, list_of_references): hyp_lengths, ref_lengths = 0, 0 for references, hypothesis in zip(list_of_references, hypotheses): hyp_len = len(hypothesis) hyp_lengths += hyp_len ref_lengths += closest_ref_length(references, hyp_len) # Calculate corpus-level brevity penalty. bp = brevity_penalty(ref_lengths, hyp_lengths) return bp
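# A short usage sketch for compute_bp above (made-up corpus, assuming the nltk
# bleu_score imports are in scope). Note the argument order: hypotheses first, then
# list_of_references, which is the reverse of the corpus_bleu-style helpers here.
hypotheses = [['the', 'cat', 'sat', 'on', 'mat']]
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
print(compute_bp(hypotheses, list_of_references))  # exp(1 - 6/5) ~= 0.8187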
def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
    the hypotheses and their respective references.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    # Key = ngram order, and value = no. of ngram matches.
    p_numerators = Counter()
    # Key = ngram order, and value = no. of ngram in ref.
    p_denominators = Counter()
    # Key = ngram order, and value = no. of ngram in hyp.
    sysoutput_lengths = Counter()
    hyp_lengths, ref_lengths = 0, 0

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(range(1, n + 1)):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            # Adds the no. of ngrams in the hypothesis.
            sysoutput_lengths[i] += len(hypothesis) - (i - 1)

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = nist_length_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(range(1, n + 1))]

    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) /
    #                             (# of occurrences of w_1 ... w_n) ]
    info = [0 if p_n[i].numerator == 0 or p_n[i + 1].numerator == 0  # Handles math domain and zero division errors.
            else math.log(p_n[i].numerator / p_n[i + 1].numerator)
            for i in range(len(p_n) - 1)]
    return sum(info_i / sysoutput_lengths[i] for i, info_i in enumerate(info)) * bp
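# An illustrative call to corpus_nist above (made-up corpus). It assumes the
# helpers it relies on are in scope, e.g.:
#     from nltk.translate.bleu_score import modified_precision, closest_ref_length, Fraction
#     from nltk.translate.nist_score import nist_length_penalty
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_nist(list_of_references, hypotheses, n=3))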
def custom_corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)):
    from collections import Counter
    from nltk.translate.bleu_score import (modified_precision, closest_ref_length,
                                           brevity_penalty, Fraction)
    import math

    p_numerators = Counter()    # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0, p_n, bp

    s = (w * math.log(p_i + 1e-12) for i, (w, p_i) in enumerate(zip(weights, p_n)))
    s = bp * math.exp(math.fsum(s))
    return s
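# A minimal usage sketch for custom_corpus_bleu above (made-up corpus). Note that it
# returns a plain float on the normal path but a (0, p_n, bp) tuple when there are
# no unigram matches, so callers may want to check the shape of the result.
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'mat']]
print(custom_corpus_bleu(list_of_references, hypotheses))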
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                averaging_mode="geometric", no_length_penalty=False):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses)  # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() for hypotheses

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2  # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()    # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the same")

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', \
            'Not sure how to apply length penalty in arithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0

    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_lengths)

    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)

    return s
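# A brief usage sketch for the corpus_bleu variant above (made-up corpus),
# illustrating the extra knobs it adds on top of nltk's corpus_bleu: the
# averaging_mode and no_length_penalty arguments.
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'mat']]

geometric = corpus_bleu(list_of_references, hypotheses)
no_bp = corpus_bleu(list_of_references, hypotheses,
                    no_length_penalty=True, averaging_mode='geometric')
arithmetic = corpus_bleu(list_of_references, hypotheses,
                         no_length_penalty=True, averaging_mode='arithmetic')
print(geometric, no_bp, arithmetic)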
def modified_corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None, auto_reweigh=False):
    """
    Modified from nltk.translate.bleu_score.corpus_bleu; returns
    'multi-bleu.perl'-like intermediate results.

    Args:
        list_of_references:
        hypotheses:
        weights:
        smoothing_function:
        auto_reweigh:

    Returns:

    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()    # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), \
        f"The number of hypotheses and their reference(s) should be " \
        f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0

    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len)

    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
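# A minimal usage sketch for modified_corpus_bleu above (made-up corpus), printing
# the intermediate results in a multi-bleu.perl-like layout.
list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'mat']]

bleu, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(list_of_references, hypotheses)
print('BLEU = {:.2f}, {} (BP={:.3f}, ratio={:.3f}, hyp_len={}, ref_len={})'.format(
    bleu * 100,
    '/'.join('{:.1f}'.format(float(p) * 100) for p in p_n),
    bp, ratio, hyp_len, ref_len))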