def eval_batch(self, data_iter: DataLoader, num_batches):
    # Assumes: import numpy as np; from collections import OrderedDict;
    # from nltk.translate import bleu_score as bleu;
    # DataLoader is the (presumed PyTorch) loader type used elsewhere.
    def get_bleu_non0_lists(list_a, list_b):
        # Truncate both sequences at the first padding token (assumed id 1);
        # default to the full length so `pos` is never unbound.
        pos = len(list_a)
        for idx, val in enumerate(list_a):
            if val == 1:
                pos = idx
                break
        return list_a[:pos], list_b[:pos]

    accuracy = {
        'loss': list(),
        'perfect': list(),
        'bleu1_non0': list(),
        'bleu1': list(),
        'bleu4': list(),
        'bleu_half': list()
    }
    accuracy = OrderedDict(sorted(accuracy.items()))
    for i in range(num_batches):
        eval_loss, output, x, y = self.get_eval_batch_data(data_iter)
        accuracy['loss'].append(eval_loss)
        for idx in range(len(output)):
            real = y.T[idx]
            predict = np.argmax(output, axis=2)[idx]
            accuracy['perfect'].append(all(r == p for r, p in zip(real, predict)))
            real_n0, predict_n0 = get_bleu_non0_lists(real, predict)
            accuracy['bleu1_non0'].append(
                float(bleu.modified_precision([real_n0], predict_n0, n=1)))
            accuracy['bleu1'].append(
                float(bleu.modified_precision([real], predict, n=1)))
            accuracy['bleu4'].append(
                float(bleu.modified_precision([real], predict, n=4)))
            accuracy['bleu_half'].append(
                float(bleu.modified_precision(
                    [real], predict,
                    n=self.model.seq_out_len // 2)))  # n must be an int; `/` gave a float
    for k, v in accuracy.items():
        accuracy[k] = np.mean(v)
    return accuracy
import math
from collections import Counter
from nltk.translate.bleu_score import (modified_precision, closest_ref_length,
                                       brevity_penalty, SmoothingFunction,
                                       Fraction)


def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                emulate_multibleu=False):
    p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0
    if len(list_of_references) != len(hypotheses):
        print("The number of hypotheses and their reference(s) should be the same")
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weight based on maximum hypothesis length if the largest
    # order of n-grams < 4 and weights are set at the default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collect the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]
    p_n_ = [xx.numerator / xx.denominator * 100 for xx in p_n]

    # Return 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0, since if there are
    # no unigrams, there won't be any higher-order ngrams.
    if p_numerators[1] == 0:
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
    s = (w * math.log(p_i) for w, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s)) * 100
    final_bleu = round(s, 4) if emulate_multibleu else s
    return (final_bleu, *p_n_, bp, ref_lengths, hyp_lengths)
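A minimal usage sketch for the corpus_bleu variant above (the token lists are illustrative, and NLTK is assumed installed). It unpacks the eight-element return value: the scaled BLEU score, the four per-order precisions in percent, the brevity penalty, and the corpus-level reference and hypothesis lengths.

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'a', 'mat']]
score, p1, p2, p3, p4, bp, ref_len, hyp_len = corpus_bleu(refs, hyps)
print(score, (p1, p2, p3, p4), bp, ref_len, hyp_len)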
from nltk.translate.bleu_score import modified_precision


def ngram(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)):
    # `weights` was an implicit global in the original; it is made an explicit
    # parameter here so the function is self-contained (only its length is used).
    p_gram = [0, 0, 0, 0]
    for i, w in enumerate(weights, start=0):
        p_i = modified_precision([reference], candidate, i + 1)
        p_gram[i] = float(p_i.numerator) / float(p_i.denominator)
    # print(p_gram)
    return p_gram
from nltk.translate import bleu_score


def get_bleu_scores(infile1, infile2, n=1):
    print("\nComputing BLEU score for n={}...".format(n))
    with open(infile1) as f1, open(infile2) as f2:
        sentences_f1 = f1.readlines()
        sentences_f2 = f2.readlines()
    # weights = tuple([1 / float(n) for i in range(n)])
    # print('corpus_bleu: ', bleu_score.corpus_bleu(sentences_f1, sentences_f2, weights=weights))
    bleu_scores = []
    for ref_line, hyp_line in zip(sentences_f1, sentences_f2):
        # Ground truth: strip the trailing newline and tokenize on whitespace.
        # (The original passed raw strings, which makes NLTK count character
        # n-grams rather than word n-grams.)
        reference = ref_line.rstrip('\n').split()
        # Generated hypothesis.
        hypothesis = hyp_line.rstrip('\n').split()
        score = bleu_score.modified_precision([reference], hypothesis, n)
        bleu_scores.append(float(score))
    avg_bleu_score = sum(bleu_scores) / float(len(bleu_scores))
    print("Average BLEU score: {}\n".format(avg_bleu_score))
import numpy as np
from nltk.translate import bleu_score


def mp(targets, responses, n_bleu=1):
    """Computes modified precision scores for a batch of response-target pairs."""
    return np.array([
        float(bleu_score.modified_precision(targets[i], responses[i], n=n_bleu))
        for i in range(len(targets))
    ])
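A hedged example for mp(): each targets[i] is itself a list of reference token lists, matching the shape modified_precision expects (the data here is made up).

targets = [[['a', 'b', 'c']], [['x', 'y']]]
responses = [['a', 'b'], ['x', 'z']]
print(mp(targets, responses, n_bleu=1))  # array([1. , 0.5])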
def get_modified_precision_scores(self, reference, hypothesis, n_max_=4):
    # Collect modified precision for orders 2 through n_max_.
    ps = []
    for i in range(2, n_max_ + 1):
        p_i = modified_precision([reference], hypothesis, i)
        self.get_p_numerators()[i] = p_i.numerator
        self.get_p_denominators()[i] = p_i.denominator
        ps.append(p_i)
    return ps
import re
from nltk.translate import bleu_score


def get_bleu_score(premise, hypothesis, n):
    try:
        # Note the orientation: the hypothesis string serves as the
        # reference, and the premise as the candidate.
        score = float(bleu_score.modified_precision(
            [re.findall(r"\w+", hypothesis)],
            re.findall(r"\w+", premise),
            n
        ))
        return score
    except ZeroDivisionError:
        return 0.0
import math
from collections import Counter
from nltk.translate.bleu_score import (modified_precision, closest_ref_length,
                                       Fraction)
from nltk.translate.nist_score import nist_length_penalty


def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
    the hypotheses and their respective references.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    p_numerators = Counter()       # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()     # Key = ngram order, value = no. of ngrams in hyp.
    sysoutput_lengths = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        # (Orders run 1..n; the original `enumerate` started at 0, which
        # issued an invalid order-0 call and skipped order n.)
        for i in range(1, n + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            # Add the no. of ngrams in the hypothesis.
            sysoutput_lengths[i] += len(hypothesis) - (i - 1)

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level length penalty.
    bp = nist_length_penalty(ref_lengths, hyp_lengths)

    # Collect the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i in range(1, n + 1)]

    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) /
    #                             (# of occurrences of w_1 ... w_n) ]
    # (natural log kept from the original, although Doddington specifies log_2)
    info = [0 if p_n[i].numerator == 0 or p_n[i + 1].numerator == 0
            # Handles math domain and zero division errors.
            else math.log(p_n[i].numerator / p_n[i + 1].numerator)
            for i in range(len(p_n) - 1)]
    return sum(info_i / sysoutput_lengths[i + 1]
               for i, info_i in enumerate(info)) * bp
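A usage sketch for corpus_nist above, on illustrative pre-tokenized data (assumes the NLTK imports shown above resolve):

list_of_references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_nist(list_of_references, hypotheses, n=2))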
def custom_corpus_bleu(list_of_references, hypotheses,
                       weights=(0.25, 0.25, 0.25, 0.25)):
    from collections import Counter
    from nltk.translate.bleu_score import (modified_precision,
                                           closest_ref_length,
                                           brevity_penalty, Fraction)
    import math

    p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collect the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0, since if there are
    # no unigrams, there won't be any higher-order ngrams.
    if p_numerators[1] == 0:
        return 0, p_n, bp

    # The epsilon keeps math.log defined when a higher-order precision is 0.
    s = (w * math.log(p_i + 1e-12) for w, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    # Return the same shape as the early-exit branch above (the original
    # returned a bare scalar here, which was inconsistent).
    return s, p_n, bp
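A short call sketch (made-up data), reflecting the unified three-value return above:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'a', 'mat']]
score, p_n, bp = custom_corpus_bleu(refs, hyps)
print(score, p_n, bp)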
def test_sentence_bleu(self):
    # Assumes: from nltk.translate.bleu_score import (modified_precision,
    #     sentence_bleu, SmoothingFunction)
    sm = SmoothingFunction()
    reference1 = 'the cat is on the mat'.split()
    reference2 = 'there is a cat on the mat'.split()
    references = [reference1, reference2]
    hypothesis1 = 'the the the the the the the'.split()  # known to 'trick' BLEU
    hypothesis2 = 'the cat the the the the mat'.split()  # should still score much higher for BLEU-1 (but not BLEU-4)

    score1_mod = float(modified_precision(references, hypothesis1, n=1))
    score1_bleu1 = sentence_bleu(references, hypothesis1, (1, 0, 0, 0))
    score1_bleu1_method5 = sentence_bleu(references, hypothesis1, (1, 0, 0, 0),
                                         smoothing_function=sm.method5)
    score1_bleu4 = sentence_bleu(references, hypothesis1)
    print("Scores for hypothesis1 modified=%f, BLEU-1=%f (method5 %f), BLEU-4=%f"
          % (score1_mod, score1_bleu1, score1_bleu1_method5, score1_bleu4))

    score2_mod = float(modified_precision(references, hypothesis2, n=1))
    score2_bleu1 = sentence_bleu(references, hypothesis2, (1, 0, 0, 0))
    score2_bleu1_method5 = sentence_bleu(references, hypothesis2, (1, 0, 0, 0),
                                         smoothing_function=sm.method5)
    score2_bleu4 = sentence_bleu(references, hypothesis2)
    print("Scores for hypothesis2 modified=%f, BLEU-1=%f (method5 %f), BLEU-4=%f"
          % (score2_mod, score2_bleu1, score2_bleu1_method5, score2_bleu4))

    self.assertLess(score1_mod, 0.3)
    self.assertGreater(score1_bleu1, 0.0)
    self.assertLess(score1_bleu1, 0.3)
    self.assertLess(score1_bleu4, 0.1)
    self.assertLess(score2_mod, 0.6)
    self.assertGreater(score2_bleu1, 0.0)
    self.assertLess(score2_bleu1, 0.6)
    self.assertGreater(score2_bleu4, 0.0)
    self.assertLess(score2_bleu4, 0.1)
def test_sentence_perfect1(self):
    reference1 = 'You eat a cheese sandwich'.split()
    reference2 = "You're eating a cheese sandwich".split()
    reference3 = 'You are eating a cheese sandwich'.split()
    references = [reference1, reference2, reference3]
    hypothesis1 = 'You eat a cheese sandwich'.split()

    score1_mod = float(modified_precision(references, hypothesis1, n=1))
    score1_bleu1 = sentence_bleu([reference1], hypothesis1, (1, 0, 0, 0))
    score1_bleu4 = sentence_bleu([reference1], hypothesis1)
    score_bleu4 = sentence_bleu(references, hypothesis1)
    print("Scores for hypothesis modified=%f, BLEU-1=%f, BLEU-4=%f (all %f)"
          % (score1_mod, score1_bleu1, score1_bleu4, score_bleu4))

    self.assertGreaterEqual(score1_bleu4, 1.0)
    self.assertGreaterEqual(score_bleu4, 1.0)
def cal_BLEU(hypp, reff, data, ngram, debug):
    # `prepare_for_bleu` is defined elsewhere in this module.
    hyp = [prepare_for_bleu(s, data, ngram) for s in hypp]
    ref = [prepare_for_bleu(s, data, ngram) for s in reff]
    # Check URL below for this powerful package (modified_precision):
    # http://www.nltk.org/api/nltk.translate.html#nltk.translate.bleu_score.modified_precision
    bleu = [0.] * len(ngram)
    if debug:
        print(hyp)
    for s in hyp:
        for i in range(len(ngram)):
            bleu[i] += round(float(modified_precision(ref, s, n=ngram[i])), 5)
    return [x / len(hyp) for x in bleu]
from nltk.translate import bleu_score


def tokenize_modified_precision(reference, candidate, n=3, verbose=False):
    """
    :param reference:
    :param candidate:
    :param n:
    :return:

    # TODO:
    >>> tokenize_modified_precision("der String von a", "der String von b", n=1)
    0.75
    """
    if verbose:
        print("#" * 30 + "\r\ncandidate:\r\n", candidate, "\r\nreference:", reference)
    # `tokenize` is provided elsewhere in this module.
    candidate, reference = tokenize(candidate), tokenize(reference)
    bleu = float(bleu_score.modified_precision([reference], candidate, n=n))
    return bleu
import re
import pandas as pd
from nltk.translate.bleu_score import modified_precision


def mark_text(filename):
    # `textdir` and `newsdir` are module-level path prefixes.
    print(filename)
    textframe = pd.read_csv(textdir + filename)
    newsframe = pd.read_csv(newsdir + filename)
    newstext = []
    mark = []
    for item in newsframe['word_sequence']:
        if not pd.isna(item):  # skip NaN cells (the original checked the removed np.float)
            sentence = re.split(r'/', item)  # '[//]' in the original is just '/'
            newstext.append(sentence)
    for item in textframe['word_sequence']:
        sentence = re.split(r'/', item)
        try:
            mark.append(float(modified_precision(references=newstext,
                                                 hypothesis=sentence, n=2)))
        except ZeroDivisionError:
            mark.append(0)
    textframe['f_score'] = mark
    # print(textframe[textframe['f_score'] != 0]['f_score'])
    textframe.to_csv(textdir + filename, index=False)
def batch_precision_parameters(self, references: List[np.ndarray],
                               hypotheses: List[np.ndarray]) -> List[float]:
    """
    Calculate modified precision per n-gram for input references and
    hypotheses combinations.

    Args:
        references: Ground truth sentences.
        hypotheses: Predicted sentences.

    Returns:
        List of sentence-level BLEU scores.
    """
    assert len(references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"
    sentence_level_scores = []
    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(references, hypotheses):
        # For each order of ngram, accumulate the correctly predicted and
        # total predicted n-grams for the corpus-level modified precision.
        reference = get_formated_reference(reference)
        hypothesis = get_formated_list(hypothesis)
        for i in range(1, self.n_gram + 1):
            p_i = modified_precision(reference, hypothesis, i)
            self.no_of_correct_predicted[i] += p_i.numerator
            self.no_of_total_predicted[i] += p_i.denominator
        sentence_level_scores.append(
            sentence_bleu(reference, hypothesis, self.weights,
                          self.smoothing_function))
        # Calculate the hypothesis length and the closest reference length.
        # Add them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        self.total_hypotheses_length += hyp_len
        ref_lens = (len(ref) for ref in reference)
        self.total_references_length += min(
            ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
    return sentence_level_scores
from collections import Counter


def basic_precision_recall(r, h, display=False):
    # `count_match` is defined elsewhere in this module.
    p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    r_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    r_denominators = Counter()  # Key = ngram order, value = no. of ngrams in ref.
    metrics = {"rc": 0, "rt": 0, "tp": 0, "tc": 0, "word": {}}
    if display:
        print("total utts={0:d}".format(len(r)))
    i = 1  # unigrams only
    for references, hypothesis in zip(r, h):
        # if min([len(any_ref) for any_ref in references]) > 0:
        if len(hypothesis) > 0:
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            metrics["tc"] += p_i.numerator
            metrics["tp"] += p_i.denominator
        else:
            p_numerators[i] += 0
            p_denominators[i] += 0
            metrics["tc"] += 0
            metrics["tp"] += 0
        # print(p_i.numerator, p_i.denominator)

        # Recall: keep the reference with the best word-level recall.
        max_recall_match, max_tp, max_t, max_word_level_details = \
            count_match(references[0], hypothesis)
        max_recall = max_recall_match / max_t if max_t > 0 else 0
        for curr_ref in references:
            curr_match, curr_tp, curr_t, curr_word_level_details = \
                count_match(curr_ref, hypothesis)
            curr_recall = curr_match / curr_t if curr_t > 0 else 0
            if curr_recall > max_recall:
                max_recall_match = curr_match
                max_t = curr_t
                max_recall = curr_recall
                max_word_level_details = curr_word_level_details
        r_numerators[i] += max_recall_match
        r_denominators[i] += max_t
        metrics["rc"] += max_recall_match
        metrics["rt"] += max_t
        for key in {"t", "tp", "tc"}:
            for w in max_word_level_details[key]:
                if w not in metrics["word"]:
                    metrics["word"][w] = {"t": 0, "tp": 0, "tc": 0}
                metrics["word"][w][key] += max_word_level_details[key][w]

    prec = [(n / d) * 100 if d > 0 else 0
            for n, d in zip(p_numerators.values(), p_denominators.values())]
    rec = [(n / d) * 100 if d > 0 else 0
           for n, d in zip(r_numerators.values(), r_denominators.values())]
    if display:
        print("{0:10s} | {1:>8s}".format("metric", "1-gram"))
        print("-" * 54)
        print("{0:10s} | {1:8.2f}".format("precision", *prec))
        print("{0:10s} | {1:8.2f}".format("recall", *rec))
    return prec[0], rec[0], metrics
from nltk.translate.bleu_score import modified_precision


def ngram_overlap(s1, s2, n=1):
    w1 = s1.split(" ")
    w2 = s2.split(" ")
    return float(modified_precision([w1], w2, n))
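Quick sanity check for ngram_overlap (made-up sentences; the first argument is the reference):

print(ngram_overlap("the cat is on the mat", "the cat sat on the mat"))  # 5/6 ≈ 0.833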
import math
from collections import Counter
from nltk.translate.bleu_score import (modified_precision, closest_ref_length,
                                       brevity_penalty, SmoothingFunction,
                                       Fraction)


def modified_corpus_bleu(list_of_references, hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None, auto_reweigh=False):
    """
    Modified from nltk.translate.bleu_score.corpus_bleu; returns
    'multi-bleu.perl'-like intermediate results.

    Args:
        list_of_references:
        hypotheses:
        weights:
        smoothing_function:
        auto_reweigh:

    Returns:
    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0
    assert len(list_of_references) == len(hypotheses), \
        f"The number of hypotheses and their reference(s) should be " \
        f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weight based on maximum hypothesis length if the largest
    # order of n-grams < 4 and weights are set at the default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collect the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0, since if there are
    # no unigrams, there won't be any higher-order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
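A minimal invocation sketch (illustrative data); the function returns the score plus the multi-bleu-style intermediates:

refs = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'is', 'on', 'a', 'mat']]
score, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(refs, hyps)
print(score, p_n, bp, ratio, hyp_len, ref_len)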
def test_modified_precision(self):
    """
    Examples from the original BLEU paper
    http://www.aclweb.org/anthology/P02-1040.pdf
    """
    # Example 1: the "the*" example.
    # Reference sentences.
    ref1 = 'the cat is on the mat'.split()
    ref2 = 'there is a cat on the mat'.split()
    # Hypothesis sentence(s).
    hyp1 = 'the the the the the the the'.split()
    references = [ref1, ref2]

    # Testing modified unigram precision.
    hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
    assert round(hyp1_unigram_precision, 4) == 0.2857
    # With assertAlmostEqual at 4 place precision.
    self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
    # Testing modified bigram precision.
    assert float(modified_precision(references, hyp1, n=2)) == 0.0

    # Example 2: the "of the" example.
    # Reference sentences.
    ref1 = str('It is a guide to action that ensures that the military '
               'will forever heed Party commands').split()
    ref2 = str('It is the guiding principle which guarantees the military '
               'forces always being under the command of the Party').split()
    ref3 = str('It is the practical guide for the army always to heed '
               'the directions of the party').split()
    # Hypothesis sentence(s).
    hyp1 = 'of the'.split()
    references = [ref1, ref2, ref3]
    # Testing modified unigram precision.
    assert float(modified_precision(references, hyp1, n=1)) == 1.0
    # Testing modified bigram precision.
    assert float(modified_precision(references, hyp1, n=2)) == 1.0

    # Example 3: proper MT outputs.
    hyp1 = str('It is a guide to action which ensures that the military '
               'always obeys the commands of the party').split()
    hyp2 = str('It is to insure the troops forever hearing the activity '
               'guidebook that party direct').split()
    references = [ref1, ref2, ref3]

    # Unigram precision.
    hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
    hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
    # Test unigram precision with assertAlmostEqual at 4 place precision.
    self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
    self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
    # Test unigram precision with rounding.
    assert round(hyp1_unigram_precision, 4) == 0.9444
    assert round(hyp2_unigram_precision, 4) == 0.5714

    # Bigram precision.
    hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
    hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
    # Test bigram precision with assertAlmostEqual at 4 place precision.
    self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
    self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
    # Test bigram precision with rounding.
    assert round(hyp1_bigram_precision, 4) == 0.5882
    assert round(hyp2_bigram_precision, 4) == 0.0769
from nltk.translate.bleu_score import modified_precision


def get_bleu_score(candidate, reference, ngrams):
    candidate = candidate.split(' ')
    reference = [reference.split(' ')]
    return float(modified_precision(reference, candidate, ngrams))
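Note the argument order here (candidate first, then reference), which is the reverse of the NLTK call it wraps; e.g.:

print(get_bleu_score('the cat is on a mat', 'the cat is on the mat', 1))  # 5/6 ≈ 0.833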
# Print test results (module-level script; `predictions`, `sentences`,
# `y_ix_to_word`, and `to_sentence` come from the surrounding notebook).
i = 0
j = 81
answers = []
for x in range(0, 20):
    answers.append([])
for p in predictions:
    print(str(j) + ". Expected output: " + to_sentence(sentences[i]))
    print(str(j) + ". Output: ", end="")
    for w in p:
        if w != 0:
            print(y_ix_to_word[w], end=" ")
            answers[i].append(y_ix_to_word[w])
    print("")
    print("")
    i += 1
    j += 1  # the original line was truncated to `j+=`

from nltk.translate.bleu_score import modified_precision

# Print BLEU scores for output sentences with reference to expected output.
j = 81
for i in range(0, 20):
    bs = float(modified_precision(sentences[i], answers[i], 1))
    print("BLEU Score for video number " + str(j) + ": " + str(bs))
    j += 1
from nltk.translate.bleu_score import modified_precision


def bleu(ref, hyp):
    return float(modified_precision([ref.split()], hyp.split(), n=1))
import math
from collections import Counter
from nltk.translate.bleu_score import (modified_precision, closest_ref_length,
                                       brevity_penalty, SmoothingFunction,
                                       Fraction)


def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                averaging_mode="geometric", no_length_penalty=False):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pairs before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses)  # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() is different from averaging
    sentence_bleu() over the hypotheses:

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2  # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function:
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :param averaging_mode: 'geometric' (standard BLEU) or 'arithmetic' mean of the precisions.
    :type averaging_mode: str
    :param no_length_penalty: Option to disable the brevity penalty.
    :type no_length_penalty: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()    # Key = ngram order, value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, value = no. of ngrams in hyp.
    hyp_lengths, ref_lengths = 0, 0
    assert len(list_of_references) == len(hypotheses), \
        "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', \
            'Not sure how to apply the length penalty in arithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weight based on maximum hypothesis length if the largest
    # order of n-grams < 4 and weights are set at the default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collect the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    # Returns 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0, since if there are
    # no unigrams, there won't be any higher-order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smooth the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_lengths)
    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)
    return s
import json
from nltk.translate.bleu_score import modified_precision


def calculate_bleu(name, dataset, q_field, a_field, as_field, QnA_vocab, model,
                   device, max_len=50):
    results = []
    precision_total = 0
    score = 0
    instances = len(dataset)
    for data in dataset:
        q = vars(data)['Q']
        a = vars(data)['A']
        a_sen = vars(data)['Ans_Sen']
        if name == "CNN":
            hypothesis, _ = cnn_predict(q, a, q_field, a_field, as_field,
                                        QnA_vocab, model, device, max_len)
        elif name == "ATTN":
            hypothesis, _ = attn_predict(q, a, q_field, a_field, as_field,
                                         QnA_vocab, model, device, max_len)
        elif name == "COPYNET":
            hypothesis = copynet_predict(q, a, q_field, a_field, as_field,
                                         QnA_vocab, model, device, max_len)
        else:
            hypothesis = []
        # Cut off the <eos> token.
        hypothesis = hypothesis[:-1]
        reference = a_sen
        # reference = [t.lower() for t in a_sen]
        blue_score = example_score(reference, hypothesis)
        precision = float(modified_precision([reference], hypothesis, n=1))
        # print(precision)
        results.append({
            'question': q,
            'answer': a,
            'reference': reference,
            'hypothesis': hypothesis,
            'blue_score': blue_score,
            'precision': precision
        })
        score += blue_score
        precision_total += precision

    if name == "CNN":
        with open('CNN_results.txt', 'w') as file:
            file.write(json.dumps(results))
    elif name == "ATTN":
        with open('ATTN_results.txt', 'w') as file:
            file.write(json.dumps(results))
    elif name == "COPYNET":
        with open('COPYNET_results.txt', 'w') as file:
            file.write(json.dumps(results))
    return score / instances, precision_total / instances