def bleu_corpus(hypothesis, references):
    from nltk.translate.bleu_score import corpus_bleu
    from nltk.translate import bleu_score as nltkbleu  # needed for SmoothingFunction below
    hypothesis = hypothesis.copy()
    references = references.copy()
    hypothesis = [hyp.split() for hyp in hypothesis]
    references = [[ref.split()] for ref in references]
    # hypothesis = [normalize_answer(hyp).split(" ") for hyp in hypothesis]
    # references = [[normalize_answer(ref).split(" ")] for ref in references]
    b1 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0,),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b2 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 2.0, 1.0 / 2.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b3 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b4 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    return (b1, b2, b3, b4)
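# A minimal usage sketch for bleu_corpus above; the sentences are toy data,
# not from any original corpus. Each hypothesis string is paired positionally
# with exactly one reference string.
def _demo_bleu_corpus():
    hyps = ["the cat sat on the mat", "a dog barked"]
    refs = ["the cat is on the mat", "the dog barked"]
    b1, b2, b3, b4 = bleu_corpus(hyps, refs)
    print(f"BLEU-1..4: {b1:.4f} {b2:.4f} {b3:.4f} {b4:.4f}")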
def _bleu(guess, answers, method=None):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores. Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    smoother = nltkbleu.SmoothingFunction(epsilon=1e-12)
    if method in {'method%d' % i for i in range(8)}:
        # method0 .. method7 are attributes of SmoothingFunction
        smoothing_func = getattr(smoother, method)
    else:
        # default, also used for a missing or unrecognized method
        smoothing_func = smoother.method3
    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=smoothing_func,
    )
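# Hedged usage sketch for _bleu: the module-level nltkbleu import and
# normalize_answer are assumed from the surrounding module. `method` selects
# one of NLTK's Chen & Cherry smoothing methods ("method0" .. "method7").
def _demo_bleu_smoothing():
    guess = "the cat sat on the mat"
    answers = ["the cat is on the mat", "there is a cat on the mat"]
    for m in ("method1", "method3"):
        print(m, _bleu(guess, answers, method=m))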
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        use_data = random.sample(self.test_data, self.batch)
        if self.device >= 0:
            sources = [cuda.cupy.asarray(x[1]) for x, _ in use_data]
            targets = [cuda.cupy.asarray(y[1]) for _, y in use_data]
            # sourcePersona = [cuda.cupy.asarray(x[0]) for x, _ in use_data]  # unused for now; may be used in the future
            targetPersona = [cuda.cupy.asarray(y[0]) for _, y in use_data]
        else:
            sources = [x[1] for x, _ in use_data]
            targets = [y[1] for _, y in use_data]
            sourcePersona = [x[0] for x, _ in use_data]
            targetPersona = [y[0] for _, y in use_data]
        sources = F.pad_sequence(sources, loadData.LoadData.maxlen, -1)
        targets = F.pad_sequence(targets, loadData.LoadData.maxlen, -1)
        references.extend([[t.tolist()] for t in targets.data])
        ys = [y.tolist()
              for y in self.model.predict(sources.data, targetPersona)]  # batch_size
        hypotheses.extend(ys)
        bleu = bleu_score.corpus_bleu(
            references,
            hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu})
def distribScoreBleu(corpusDict, dicoTrad):
    """
    Computes the distribution of BLEU scores for each sentence type,
    direct and indirect.

    Args:
        dicoScore: dictionary of scores for each type;
            key: sentence type, value: BLEU score
        dicoPhrases: dictionary of direct and indirect sentences
    Returns:
        dicoScore
    """
    chencherry = bleu.SmoothingFunction()
    dicoScore = defaultdict(list)
    dicoPhrases = dicoDirIndir(corpusDict)
    for directTrad in dicoTrad:
        dicoBleu = createDicoRefHyp(corpusDict, directTrad)
        for ref in dicoBleu.keys():
            if dicoBleu[ref][0] in dicoPhrases['direct_hyp']:
                bleuScore = bleu.sentence_bleu(
                    dicoBleu[ref],
                    ref.split(),
                    smoothing_function=chencherry.method3)
                dicoScore['direct_hyp'] += [bleuScore]
            elif dicoBleu[ref][0] in dicoPhrases['indirect_hyp']:
                bleuScore = bleu.sentence_bleu(
                    dicoBleu[ref],
                    ref.split(),
                    smoothing_function=chencherry.method3)
                dicoScore['indirect_hyp'] += [bleuScore]
    return dicoScore
def scoreBleu(corpusDict, dicoTrad):
    """
    Computes the BLEU score for each translation direction and stores it
    in the appropriate translation dictionary.

    Args:
        dicoBleu: dictionary rebuilt for each new translation direction
            by createDicoRefHyp
        totalBleu: running total of BLEU scores per translation direction
    Returns:
        The translation dictionary with key: translation direction,
        value: BLEU score
    """
    dicoBleu = {}
    chencherry = bleu.SmoothingFunction()
    for directTrad in dicoTrad:
        totalBleu = 0.0
        dicoBleu = createDicoRefHyp(corpusDict, directTrad)
        for ref in dicoBleu.keys():
            totalBleu += bleu.sentence_bleu(
                dicoBleu[ref],
                ref.split(),
                smoothing_function=chencherry.method3)
        dicoTrad[directTrad] = totalBleu / len(dicoBleu.keys())
    return dicoTrad
def get_bleu(pred, trg, lengths, idx2word, BLEU=[0.25, 0.25, 0.25, 0.25]):
    batch = pred.shape[0]
    cc = bleu.SmoothingFunction()
    score = 0
    b = 0
    for p, t in zip(pred, trg):
        p = p[1:lengths[b]]
        p = [idx2word[index].lower() for index in p]
        t = t[t != 0]
        t = t[1:-1]
        t = [idx2word[index].lower() for index in t]
        b += 1
        # print("pred:", p)
        # print("target:", t)
        score += bleu.sentence_bleu([t], p, BLEU, smoothing_function=cc.method1)
    return score
def get_ith_item(self, index, return_string=True):
    output = {
        "source": self._get_source_sentence(index, return_string=return_string),
        "reference": self._get_reference_sentence(index, return_string=return_string),
        "sampled": self._get_sampled_sentence(index, return_string=return_string),
        "attention": self._last_batch["attention"][index]
    }
    reference = output["reference"]
    hypothesis = output["sampled"]
    if return_string:
        # sentence_bleu expects token sequences, not raw strings
        reference = reference.split(" ")
        hypothesis = hypothesis.split(" ")
    chencherry = bleu_score.SmoothingFunction()
    output["bleu-4"] = bleu_score.sentence_bleu(
        references=[reference],
        hypothesis=hypothesis,
        smoothing_function=chencherry.method1)
    return output
def calculate_bleu_score(caption_model, dataset_encoddings, lookup_table,
                         word_to_idx, idx_to_word, max_length):
    example_count = 0
    bleu_sum = 0
    for image_key in tqdm(list(dataset_encoddings.keys()), position=0, leave=True):
        image = dataset_encoddings[image_key].reshape(1, IMAGE_OUTPUT_DIM)
        generated_caption = generate_caption(image, word_to_idx, idx_to_word,
                                             max_length, caption_model)
        candidate = generated_caption.split()
        references = list(
            map(lambda caption: caption.split(), lookup_table[image_key]))
        bleu_score = bleu.sentence_bleu(
            references,
            candidate,
            smoothing_function=bleu.SmoothingFunction().method1)
        bleu_sum += bleu_score
        if example_count < 5:
            log.warning(
                f'Image: {image_key}\nreal caption: {lookup_table[image_key][0]}\n'
                f'generated caption: {generated_caption}\nBLEU: {bleu_score}')
            log.warning(
                "********************************************************************")
            example_count += 1
    return bleu_sum / len(dataset_encoddings)
def __call__(self, trainer):
    print('## Calculate BLEU')
    with chainer.no_backprop_mode():
        with chainer.using_config('train', False):
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])
                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources
                ]
                ys = [
                    y.tolist()
                    for y in self.model.translate(
                        sources, self.max_length, beam=False)
                ]  # greedy generation for efficiency
                hypotheses.extend(ys)
    bleu = bleu_score.corpus_bleu(
        references,
        hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1) * 100
    print('BLEU:', bleu)
    reporter.report({self.key: bleu})
def compute_scores(pred_fname, ref_fname):
    # read files
    hyps = read_corpus(pred_fname, ref=False)
    refs = read_corpus(ref_fname, ref=True)

    # NIST score
    nist = ns.corpus_nist(refs, hyps, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)

    # edit distance
    total_len = 0.0
    edi = 0.0
    for r, h in zip(refs, hyps):
        total_len += max(len(r[0]), len(h))
        edi += edit_distance(
            r[0], h)  # TODO: strange -- inputs should be strings, not charlists!

    bleu_score = 100.0 * round(bleu, 4)
    edist_score = 100.0 * round(1 - edi / total_len, 4)
    nist_score = round(nist, 2)
    return Scores(bleu_score, edist_score, nist_score)
def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(self.test_data), self.batch):
            sources0, sources1, targets = zip(*self.test_data[i:i + self.batch])
            references.extend([[t.tolist()] for t in targets])
            sources0 = [
                chainer.dataset.to_device(self.device, x) for x in sources0
            ]
            sources1 = [
                chainer.dataset.to_device(self.device, x) for x in sources1
            ]
            ys = [
                y.tolist()
                for y in self.model.translate(sources0, sources1, self.max_length)
            ]
            hypotheses.extend(ys)
    bleu = bleu_score.corpus_bleu(
        references,
        hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
def compute(guess: str, answers: List[str], k: int = 4) -> Optional[BleuMetric]:
    """
    Compute approximate BLEU score between guess and a set of answers.
    """
    try:
        from nltk.translate import bleu_score as nltkbleu
    except ImportError:
        # User doesn't have nltk installed, so we can't use it for bleu.
        # We'll just turn off things, but we might want to warn the user.
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores. Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    weights = [1 / k for _ in range(k)]
    score = nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
        weights=weights,
    )
    return BleuMetric(score)
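# Hedged usage sketch for compute: BleuMetric and normalize_answer are
# assumed to come from the surrounding module. k=2 restricts scoring to
# uni- and bigrams with uniform weights (0.5, 0.5).
def _demo_compute():
    metric = compute("the cat sat", ["the cat sat on the mat"], k=2)
    if metric is not None:  # None means nltk is not installed
        print(metric)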
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(ref_sequences, cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
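# Note the argument order of calc_bleu_many: candidate first, references
# second (the reverse of nltk's sentence_bleu). A toy call:
def _demo_calc_bleu_many():
    cand = "the cat sat on the mat".split()
    refs = ["the cat is on the mat".split(),
            "there is a cat on the mat".split()]
    print(calc_bleu_many(cand, refs))  # BLEU-2 via weights=(0.5, 0.5)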
def test_review_bleu(gts_data, generate_data, vocab, bleu_totals, length):
    type_weights = [[1., 0, 0, 0],
                    [.5, .5, 0, 0],
                    [1 / 3, 1 / 3, 1 / 3, 0],
                    [.25, .25, .25, .25]]
    sf = bleu_score.SmoothingFunction()
    # batch first
    gts_idx = torch.transpose(gts_data, 0, 1)
    _, generate_idx = generate_data.max(2)
    generate_idx = torch.transpose(generate_idx, 0, 1)
    gts_sentence = []
    gene_sentence = []
    # detokenize the sentences
    for token_ids in gts_idx:
        current = [vocab.itos[id] for id in token_ids.detach().cpu().numpy()]
        gts_sentence.append(current)
    for token_ids in generate_idx:
        current = [vocab.itos[id] for id in token_ids.detach().cpu().numpy()]
        gene_sentence.append(current)
    # compute BLEU score
    assert len(gts_sentence) == len(gene_sentence)
    for i in range(len(gts_sentence)):
        length += 1
        for j in range(4):
            refs = [gts_sentence[i]]  # sentence_bleu expects a list of references
            sample = gene_sentence[i]
            weights = type_weights[j]
            bleu_totals[j] += bleu_score.sentence_bleu(
                refs, sample, smoothing_function=sf.method1, weights=weights)
    return bleu_totals, length
def main():
    arguments = sys.argv[1:]
    num_args = len(arguments)
    if num_args != 2:
        print('Wrong number of arguments.')
        print(str(sys.argv[0]), 'system-dir', 'reference-dir')
        exit()
    system_path = arguments[0]
    ref_path = arguments[1]
    # For all files in the system path.
    for filename in os.listdir(system_path):
        print('Filename', str(filename))
        system_filename = os.path.join(system_path, filename)
        ref_filename = os.path.join(ref_path, filename)

        # read files
        ref = read_corpus(ref_filename, ref=True)
        hyp = read_corpus(system_filename, ref=False)

        # NIST score
        nist = ns.corpus_nist(ref, hyp, n=4)

        # BLEU score
        chencherry = bs.SmoothingFunction()
        bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
        print('BLEU', str(round(bleu, 3)))

        # edit distance
        total_len = 0.0
        edi = 0.0
        for r, h in zip(ref, hyp):
            total_len += max(len(r[0]), len(h))
            edi += edit_distance(r[0], h)
        print('DIST', str(round(1 - edi / total_len, 3)))
        print('NIST', str(round(nist, 6)))
        print()
def select_question(asked: list, questions: dict, lang: str) -> list:
    # re.escape keeps the punctuation safe inside the character class
    nested_punct = re.compile('([a-zA-Z]+)[' + re.escape(string.punctuation) +
                              ']+([a-zA-Z]+)')
    replace = lambda match: match.group(1) + ' ' + match.group(2)
    scores = {}
    # If lang is not English, a morpheme function could be run on `asked` here.
    for i, question in questions.items():
        english, lang_b = question
        eng_q, eng_ans = list(english.items())[0]
        lang_b_q, lang_b_ans = list(lang_b.items())[0]
        if lang == 'english':
            hypothesis = [word.lower()
                          for word in word_tokenize(eng_q, 'english')
                          if word not in string.punctuation]
        else:
            # If possible, run lang_b_q.split() through a function that
            # returns a list of morphemes.
            hypothesis = [word.lower()
                          for word in lang_b_q.split()
                          if word not in string.punctuation]
        fin_hypothesis = []
        for word in hypothesis:
            fin_hypothesis += re.sub(nested_punct, replace, word).split()
        score = bleu_score.sentence_bleu(
            [fin_hypothesis], asked,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        scores[i] = [score, lang_b_q, lang_b_ans, eng_q, eng_ans]
    sort_scores = sorted(scores.items(), key=lambda score: score[1][0],
                         reverse=True)
    return sort_scores[:2]
def CalculateBleu(trainer=trainer,
                  model=model,
                  test_data=test_data,
                  key='validation/main/bleu',
                  batch=100,
                  max_length=100,
                  device=args.gpu):
    trigger = 1, 'epoch'
    priority = chainer.training.PRIORITY_WRITER
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(test_data), batch):
            sources, targets = zip(*test_data[i:i + batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [chainer.dataset.to_device(device, x) for x in sources]
            ys = [y.tolist() for y in model.translate(sources, max_length)]
            hypotheses.extend(ys)
        bleu = bleu_score.corpus_bleu(
            references,
            hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({key: bleu})
        print(bleu)
def CalculateBleu(model=model,
                  test_data=test_data,
                  batch=100,
                  max_length=100,
                  device=args.gpu):
    trigger = 1, 'epoch'
    priority = chainer.training.PRIORITY_WRITER
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        for i in range(0, len(test_data), batch):
            sources, targets = zip(*test_data[i:i + batch])
            references.extend([[t.tolist()] for t in targets])
            sources = [chainer.dataset.to_device(device, x) for x in sources]
            ys = [y.tolist() for y in model.translate(sources, max_length)]
            hypotheses.extend(ys)
        print(len(references))
        bleu = bleu_score.corpus_bleu(
            references,
            hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        print(bleu)
        # with open(path_w, mode='a') as f:
        #     f.write('\n bleu : %f' % bleu)
def compute_blue(searcher):
    """
    Computes the BLEU score.
    :return: the BLEU score
    """
    smoothing_fn = bleu_score.SmoothingFunction()
    score = 0
    for idx, test_data in enumerate(test_dataloader):
        question = torch.transpose(test_data[0], 0, 1).to(device)
        answer = torch.transpose(test_data[1], 0, 1)
        answer = answer[1:]
        answer = answer.reshape(1, -1).tolist()[0]

        output = searcher(question)
        output = output[1:]
        prediction = output.topk(1).indices.reshape(1, -1).tolist()[0]

        prediction = convert_to_string(prediction)
        answer = convert_to_string(answer)
        answer = [answer.split()]
        prediction = prediction.split()

        score += bleu_score.sentence_bleu(
            answer,
            prediction,
            smoothing_function=smoothing_fn.method7,
            weights=(0.5, 0.5))
        if idx % 2000 == 0:
            print('Sentences processed: [{}/{}]'.format(
                idx + 1, len(test_dataloader)))
    return (score / len(test_dataloader)) * 100
def calc_bleu_score_for_seqs(hypo_seq, ref_seqs):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(
        references=ref_seqs,
        hypothesis=hypo_seq,
        smoothing_function=sf.method1,
        weights=(0.5, 0.5),
    )
def text_bleu(gold_txts, pred_txts):
    all_ref = [[txt.lower().split()] for txt in gold_txts]
    all_hyp = [txt.lower().split() for txt in pred_txts]
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(all_ref, all_hyp,
                          smoothing_function=chencherry.method2)
    return bleu
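# text_bleu lower-cases and whitespace-tokenizes internally, so it can be
# called directly on raw strings (toy data below; the module-level import of
# nltk's bleu_score as `bs` used by text_bleu is assumed).
def _demo_text_bleu():
    gold = ["The cat sat on the mat .", "A dog barked ."]
    pred = ["the cat sat on a mat .", "a dog barked ."]
    print(text_bleu(gold, pred))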
def belu_score(predicted_seq, reference_sequences):
    smoothing_fn = bleu_score.SmoothingFunction()
    reference_sequences = np.expand_dims(reference_sequences, axis=0)
    return bleu_score.sentence_bleu(
        reference_sequences,
        predicted_seq,
        smoothing_function=smoothing_fn.method1,
        weights=(0.5, 0.5))
def calculate_dataset_bleu(img_encoddings, word_to_idx, idx_to_word,
                           max_length, caption_model, lookup_table):
    sat_captions = load_show_attend_and_tell_captions()
    sat_bleu_average = 0
    bleu_average = 0
    count = 0
    for image_key in tqdm(img_encoddings.keys()):
        image_path = os.path.join(FLICKR_FOLDER, FLICKR_IMAGES_FOLDER,
                                  image_key + '.jpg')
        if os.path.exists(image_path):
            count += 1
            image = img_encoddings[image_key].reshape(1, IMAGE_OUTPUT_DIM)
            generated_caption = generate_caption(image, word_to_idx,
                                                 idx_to_word, max_length,
                                                 caption_model)
            sat_caption = sat_captions[image_key]
            references = list(
                map(lambda caption: caption.split(), lookup_table[image_key]))
            generated_caption_bleu = bleu.sentence_bleu(
                references,
                generated_caption.split(),
                smoothing_function=bleu.SmoothingFunction().method3)
            bleu_average += generated_caption_bleu
            sat_caption_bleu = bleu.sentence_bleu(
                references,
                sat_caption.split(),
                smoothing_function=bleu.SmoothingFunction().method3)
            sat_bleu_average += sat_caption_bleu
            if count <= 2:
                print("Generated caption: ", generated_caption)
                print("SAT caption: ", sat_caption)
                print("Original caption: ", lookup_table[image_key][0])
                print("SAT caption BLEU score: ", sat_caption_bleu)
                print("Generated caption BLEU score: ", generated_caption_bleu)
                x = plt.imread(image_path)
                plt.imshow(x)
                plt.show()
                print("_____________________________________")
    print('Model validation set BLEU: ', bleu_average / count)
    print('SAT validation set BLEU: ', sat_bleu_average / count)
def _bleu_n(n, ref_words_list, hyp_words):
    """Computes BLEU score up to n-grams."""
    # Average weights across the different n-gram orders.
    weights = [1.0 / n] * n
    return bleu_score.sentence_bleu(
        ref_words_list,
        hyp_words,
        weights=weights,
        smoothing_function=bleu_score.SmoothingFunction().method1)
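# Sketch of _bleu_n at n=2: uniform weights (0.5, 0.5) over pre-tokenized
# words, i.e. a smoothed BLEU-2 (toy tokens below).
def _demo_bleu_n():
    refs = [["the", "cat", "is", "on", "the", "mat"]]
    hyp = ["the", "cat", "sat", "on", "the", "mat"]
    print(_bleu_n(2, refs, hyp))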
def computeBLUEScore(self):
    blue_references = self.packTexts(self.breakTexts(self._references))
    blue_candidates = self._candidates  # self.breakTexts(self._candidates)
    # print(blue_references)
    # print(blue_candidates)
    self._blue_score = bleu_score.corpus_bleu(
        blue_references,
        blue_candidates,
        smoothing_function=bleu_score.SmoothingFunction().method1)
def get_bleu(targets, predictions):
    bleu_scores = 0
    smooth = bleu_score.SmoothingFunction()
    for i in range(len(predictions)):
        score = bleu_score.sentence_bleu(targets[i], predictions[i],
                                         smoothing_function=smooth.method7)
        bleu_scores += score
    return float(bleu_scores / len(targets))
def bleu(ref_list, candidateText, nGram=4, nGramType='cumulative',
         shouldSmooth=True):
    '''Calculates the BLEU score.

    _parameters_
    ref_list: a list of reference texts to compare against (as strings)
    candidateText: the new text to be scored
    nGram: choose between 1-4; determines which n-gram(s) to use in the scoring
    nGramType: 'cumulative' uses a simple average of all n-grams from 1 to nGram
    shouldSmooth: if False, calculates the BLEU score without smoothing.
        Smoothing is recommended (set to True).

    _returns_
    score: BLEU score using the nGram settings above, smoothed by default
        (smoothing can be turned off)
    '''
    # basic checks
    if nGram not in [1, 2, 3, 4]:
        raise ValueError('nGram must be between 1 and 4')
    if nGramType not in ['cumulative', 'exclusive']:
        raise ValueError(
            'nGramType must either be cumulative (average of nGrams less than n) '
            'or exclusive (1=unigram, etc.)')

    # pre-score
    weight_dict = {
        ('cumulative', 1): (1, 0, 0, 0),
        ('cumulative', 2): (.5, .5, 0, 0),
        ('cumulative', 3): (.3333, .3333, .3333, 0),
        ('cumulative', 4): (.25, .25, .25, .25),
        ('exclusive', 1): (1, 0, 0, 0),
        ('exclusive', 2): (0, 1, 0, 0),
        ('exclusive', 3): (0, 0, 1, 0),
        ('exclusive', 4): (0, 0, 0, 1)
    }
    candidate = [removePunc(str(removeMarkupWords(candidateText))).split()]
    references = [[
        removePunc(str(removeMarkupWords(ref))).split() for ref in ref_list
    ]]
    weights = weight_dict[(nGramType, nGram)]

    # scoring
    if shouldSmooth:
        smoother = bleu_score.SmoothingFunction().method7
    else:
        smoother = None
    score = bleu_score.corpus_bleu(references, candidate, weights,
                                   smoothing_function=smoother)
    return score
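# Hedged usage sketch for bleu above; removePunc and removeMarkupWords are
# assumed to be the module's own helpers, so only the call pattern is shown:
# cumulative scoring averages 1..n-grams, exclusive scoring uses the n-gram
# order alone.
#
#   refs = ["the cat is on the mat", "there is a cat on the mat"]
#   bleu(refs, "the cat sat on the mat")                                  # cumulative 4-gram
#   bleu(refs, "the cat sat on the mat", nGram=2, nGramType='exclusive')  # bigram only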
def evaluate_bleu(dataloader, generator, eval_num=32, mode='sentence'):
    ref_list = []
    gen_list = []
    dataloader.test_init()
    if mode == 'corpus':
        while not dataloader.test_finished():
            batch = dataloader.test_next_batch()
            ref_list.extend(batch.astype(str).tolist())
            gen = generator.generate()
            gen_list.extend(gen.astype(str).tolist())
        return bleu.corpus_bleu(
            ref_list,
            gen_list,
            smoothing_function=bleu.SmoothingFunction().method4)
    else:
        while not dataloader.test_finished():
            batch = dataloader.test_next_batch()
            ref_list.extend(batch.astype(str).tolist())
        for i in range(eval_num // batch_size):
            gen = generator.generate()
            gen_list.extend(gen.astype(str).tolist())
        score8, score12 = 0., 0.
        for i, gen in enumerate(gen_list):
            x8 = bleu.sentence_bleu(
                ref_list,
                gen,
                weights=(1.0 / 8,) * 8,
                smoothing_function=bleu.SmoothingFunction().method4)
            x12 = bleu.sentence_bleu(
                ref_list,
                gen,
                weights=(1.0 / 12,) * 12,
                smoothing_function=bleu.SmoothingFunction().method4)
            score8 += x8
            score12 += x12
        return score8 / len(gen_list), score12 / len(gen_list)
def test(modelPath):
    settings.sysAsserts()
    dataset = COCODataset('coco/images/val2017',
                          'coco/annotations/captions_val2017.json', True)
    with open(settings.vocabfilepath, 'rb') as dsf:
        pcklr = pickle.Unpickler(dsf)
        obj = pcklr.load()
        vocab = obj.get('vocab')
        maxCaptionLen = obj.get('maxCaptionLen')
        wordToInd = obj.get('wordToInd')
    cnn, rnn, outputNet = archs.CNN.ResNet34, archs.RNN.LSTM_2l, archs.OutputNet.FC1l
    model = archs.Architecture(cnn, rnn, outputNet, len(vocab),
                               maxCaptionLen + MainHyperparams().maxCaptionLenDelta,
                               modelPath)
    loss = .0
    references = []
    hypotheses = []
    chencherry = bleu.SmoothingFunction()
    model.eval()
    with torch.no_grad():
        for i in range(len(dataset.coco)):
            image, captions = dataset.coco[i]
            image = image.unsqueeze(0).to(settings.device)
            captions = [c.lower() for c in captions]
            outputSeqs, outputLogProbs = model(image)
            outputSeq = [utils.indicesToSentence(vocab, outputSeqs[0])]
            loss += criterion(outputLogProbs, outputSeq, wordToInd)
            references.append(captions)
            hypotheses.append(outputSeq)
            if i % 100 == 0:
                print(f'Evaluated {i + 1}/{len(dataset.coco)}')
                print(f"-----{(dataset.coco.coco.imgs[dataset.coco.ids[i]])['file_name']}")
                print(f'Truth: {[c.capitalize() for c in captions]}')
                print(f'Prediction: {outputSeq[0].capitalize()}')
                print("\n")
    loss /= len(dataset.coco)
    bleu2 = bleu.corpus_bleu(references, hypotheses, weights=(0.5, 0.5),
                             smoothing_function=chencherry.method1)
    bleu3 = bleu.corpus_bleu(references, hypotheses,
                             weights=(1. / 3., 1. / 3., 1. / 3.),
                             smoothing_function=chencherry.method1)
    bleu4 = bleu.corpus_bleu(references, hypotheses,
                             weights=(0.25, 0.25, 0.25, 0.25),
                             smoothing_function=chencherry.method1)
    print('--------------------------------------------------------------')
    print(f'Testing dataset consists of {len(dataset.coco)} samples')
    print(f'Negative log-likelihood loss = {loss.item()}')
    print(f'Perplexity = {2 ** loss.item()}')
    print(f'BLEU-2 score = {bleu2}')
    print(f'BLEU-3 score = {bleu3}')
    print(f'BLEU-4 score = {bleu4}')
def __init__(self, N_gram=4, precision=2):
    from nltk.translate import bleu_score
    self.bleu_score = bleu_score
    self.N_gram = N_gram
    self._precision = precision
    self._weights = [
        tuple([round(1 / i, 4)] * i + [0.] * (N_gram - i))
        for i in range(1, N_gram + 1)
    ]
    self.smoothing_fn = bleu_score.SmoothingFunction().method3
    self.reset()
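# For the default N_gram=4, the _weights list built in __init__ above is,
# in order:
#   (1.0, 0.0, 0.0, 0.0)           # BLEU-1
#   (0.5, 0.5, 0.0, 0.0)           # BLEU-2
#   (0.3333, 0.3333, 0.3333, 0.0)  # BLEU-3
#   (0.25, 0.25, 0.25, 0.25)       # BLEU-4
# i.e. one uniform weight vector per n-gram order, zero-padded to length N_gram.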